public inbox for [email protected]help / color / mirror / Atom feed
Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY 6+ messages / 1 participants [nested] [flat]
* Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY @ 2026-03-09 00:03 Mihail Nikalayeu <[email protected]> 2026-03-20 01:15 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 0 siblings, 1 reply; 6+ messages in thread From: Mihail Nikalayeu @ 2026-03-09 00:03 UTC (permalink / raw) To: PostgreSQL Hackers <[email protected]>; +Cc: Matthias van de Meent <[email protected]>; Álvaro Herrera <[email protected]>; Antonin Houska <[email protected]>; Sergey Sargsyan <[email protected]>; Hannu Krosing <[email protected]> Hello, everyone! Rebased. Attachments: [application/x-patch] v2-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch (42.6K, 2-v2-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch) download | inline diff: From e4f207350689364f6fb4a551f7516460e27f03ea Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 19:02:34 +0300 Subject: [PATCH v2 4/4] Support snapshot resets in concurrent builds of unique indexes Previously, concurrent builds of unique indexes used a fixed snapshot for the entire scan to ensure proper uniqueness checks. Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: - ignoring tuples that are dead according to SnapshotSelf during uniqueness checks in tuplesort, not as a guarantee but as a fail-fast mechanism - adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values as a guarantee of correctness Tuples are tested against SnapshotSelf only when their index key values are equal; otherwise _bt_load works as before. 
--- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 6 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtreadpage.c | 2 +- src/backend/access/nbtree/nbtsort.c | 195 ++++++++++++++---- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 30 ++- src/backend/catalog/index.c | 8 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/sort/tuplesortvariants.c | 71 +++++-- src/include/access/nbtree.h | 4 +- src/include/access/tableam.h | 2 +- src/include/utils/tuplesort.h | 1 + .../expected/cic_reset_snapshots.out | 31 ++- 14 files changed, 290 insertions(+), 96 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375a..829dad1194e 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. Any tuples visible in the -snapshot will have only valid forward-growing HOT chains. (They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. +for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) +As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. 
We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 24a554a10d4..4bd1edf865d 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1252,15 +1252,15 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique index we need consistent snapshot for the whole scan. + * For concurrent builds of non-system indexes, we may want to periodically + * reset snapshots to allow vacuum to clean up tuples. */ reset_snapshots = indexInfo->ii_Concurrent && - !indexInfo->ii_Unique && !is_system_catalog; /* just for the case */ /* okay to ignore lazy VACUUMs here */ diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index 95be0b17939..bf8c96121e6 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -148,7 +148,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -374,7 +374,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > 
nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -787,12 +787,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index 2ba1ca66023..c4c278e777a 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -623,7 +623,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 9c395d1ac38..25dcb912a39 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -86,6 +86,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -104,6 +105,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -206,15 +208,13 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only 
when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and built non-concurrently. + * Dead tuples are put into spool2 instead of spool in order to avoid uniqueness check. */ BTSpool *spool2; double indtuples; @@ -261,7 +261,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -306,8 +306,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -324,20 +322,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -384,6 +382,11 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks in case of concurrent build. + * It is required because of periodic reset of snapshot. + */ + btspool->unique_dead_ignored = indexInfo->ii_Concurrent && indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -432,8 +435,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor. */ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -441,8 +445,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. 
+ * + * In case of concurrent build dead tuples do not need to be put into index + * since we wait for all snapshots older than reference snapshot during the + * validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent) { BTSpool *btspool2 = palloc0_object(BTSpool); SortCoordinate coordinate2 = NULL; @@ -473,7 +481,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -486,7 +494,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -542,7 +550,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent) { BTWriteState wstate; @@ -564,7 +572,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; @@ -578,7 +586,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1155,13 +1163,118 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; + bool fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * The unique_dead_ignored does not guarantee absence of multiple alive + * tuples with same values exists in the spool. Such thing may happen if + * alive tuples are located between a few dead tuples, like this: addda. 
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* if is not the first tuple */ + { + bool has_nulls = false, + call_again, /* just to pass something */ + ignored, /* just to pass something */ + now_alive; + ItemPointerData tid; + + /* if this tuples equal to previous one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* check previous tuple if not yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group? 
*/ + if (seen_alive && now_alive) + { + char *key_desc; + TupleDesc tupDes = RelationGetDescr(wstate->index); + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + + index_deform_tuple(itup, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(wstate->index, values, isnull); + + /* keep this message in sync with the same in comparetup_index_btree_tiebreak */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(wstate->index)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(wstate->heap, + RelationGetRelationName(wstate->index)))); + } + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + if (prev) pfree(prev); + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1321,7 +1434,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1418,7 +1531,6 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; - bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,21 +1548,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; - /* - * For concurrent non-unique index builds, we can periodically reset snapshots - * to allow the xmin horizon to advance. This is safe since these builds don't - * require a consistent view across the entire scan. Unique indexes still need - * a stable snapshot to properly enforce uniqueness constraints. - */ - reset_snapshot = isconcurrent && !btspool->isunique; - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that, while that snapshot may be reset periodically in - * case of non-unique index. + * live according to that, while that snapshot may be reset periodically. */ if (!isconcurrent) { @@ -1458,16 +1561,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } - else if (reset_snapshot) + else { + /* + * For concurrent index builds, we can periodically reset snapshots to allow + * the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. 
+ */ snapshot = InvalidSnapshot; PushActiveSnapshot(GetTransactionSnapshot()); } - else - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1537,6 +1640,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1551,7 +1655,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), snapshot, - reset_snapshot); + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1631,7 +1735,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * In case of concurrent build snapshots are going to be reset periodically. * Wait until all workers imported initial snapshot. */ - if (reset_snapshot) + if (isconcurrent) WaitForParallelWorkersToAttach(pcxt, true); /* Join heap scan ourselves */ @@ -1642,13 +1746,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - if (!reset_snapshot) + if (!isconcurrent) WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); InvalidateCatalogSnapshot(); - Assert(!reset_snapshot|| !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1748,6 +1852,7 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ if (!btleader->btshared->isunique) @@ -1851,11 +1956,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || btshared->isconcurrent) { btspool2 = NULL; sharedsort2 = NULL; @@ -1935,6 +2041,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1957,14 +2064,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = 
btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index de9eca3c8b2..b174fb8d3ee 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -688,7 +688,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -719,7 +719,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -968,7 +968,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -989,7 +989,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1028,7 +1028,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1150,7 +1150,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 5c50f0dd1bd..1326d0fdbb7 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -30,9 +30,6 @@ static int _bt_compare_int(const void *va, const void *vb); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); - /* * _bt_mkscankey @@ -713,7 +710,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -831,17 +828,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. 
* + * This is exported to be used as comparison function during concurrent + * unique index build in case _bt_keep_natts_fast is not suitable because + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * hasnulls value set to true in case of any null column in any tuple. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -867,6 +871,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; if (isNull1 != isNull2) break; @@ -886,7 +892,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -897,7 +903,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. + * save cycles. 
Also, it may be used as comparison function during concurrent + * build of unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -906,6 +913,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * hasnulls value set to true in case of any null column in any tuple. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -914,7 +923,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -931,6 +941,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index b54921ad546..8ddef0858f1 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1531,7 +1531,7 @@ index_concurrently_build(Oid heapRelationId, index_build(heapRel, indexRelation, indexInfo, false, true); InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions 
*/ AtEOXact_GUC(false, save_nestlevel); @@ -3322,9 +3322,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 67e207220d1..2c5b19b3910 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1701,8 +1701,8 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. 
Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 2509ac3e3a4..1ecf745c663 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -25,6 +25,8 @@ #include "access/hash.h" #include "access/htup_details.h" #include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" #include "catalog/index.h" #include "catalog/pg_collation.h" #include "executor/executor.h" @@ -35,6 +37,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* sort-type codes for sort__start probes */ @@ -136,6 +139,7 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ } TuplesortIndexBTreeArg; /* @@ -361,6 +365,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -403,6 +408,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1670,6 +1676,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1679,18 +1686,58 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - 
- ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* This is fail-fast check, see _bt_load for details. */ + if (arg->uniqueDeadIgnored) + { + bool any_tuple_dead, + call_again = false, + ignored; + + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tuple2->t_tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 77224859685..76e277c2e3d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1310,8 +1310,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 84b06ffa42f..92290e79591 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1784,7 +1784,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent index build, + * In case of concurrent index build, * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots * on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index da68f45acf2..448dc83aa58 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -396,6 +396,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index b4ad90eb339..bb84d61f40d 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -17,6 +17,12 @@ SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice' (1 row) +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -35,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -72,30 +82,47 @@ NOTICE: 
notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + injection_points_detach +------------------------- + +(1 row) + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: 
notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; BEGIN TRANSACTION; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); -- 2.43.0 [application/x-patch] v2-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch (51.1K, 3-v2-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch) download | inline diff: From 2cecfede527f3f886f3defb27938b3610257e1c6 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 15:59:47 +0300 Subject: [PATCH v2 2/4] Reset snapshots periodically in non-unique non-parallel concurrent index builds Long-living snapshots used by CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY can hold back the global xmin horizon. Commit d9d076222f5b attempted to allow VACUUM to ignore such snapshots to mitigate this problem. However, this was reverted in commit e28bb8851969 because it could cause indexes to miss heap tuples that were HOT-updated and HOT-pruned during the index creation, leading to index corruption. This patch introduces an alternative by periodically resetting the snapshot used during the first phase. 
By resetting the snapshot every N pages during the heap scan, it allows the xmin horizon to advance. Currently, this technique is applied only to: - the first scan of the heap: the second scan during index validation still uses a single snapshot to ensure index correctness - non-parallel index builds: parallel index builds are not yet supported and will be addressed in following commits - non-unique indexes: unique index builds still require a consistent snapshot to enforce uniqueness constraints; this will be addressed in following commits A new scan option SO_RESET_SNAPSHOT is introduced. When set, it causes the snapshot to be reset "between" every SO_RESET_SNAPSHOT_EACH_N_PAGE pages during the scan. The heap scan code is adjusted to support this option, and the index build code is modified to use it for applicable concurrent index builds that are not on system catalogs and not using parallel workers. --- contrib/amcheck/verify_nbtree.c | 3 +- contrib/pgstattuple/pgstattuple.c | 2 +- src/backend/access/brin/brin.c | 19 ++- src/backend/access/gin/gininsert.c | 22 ++++ src/backend/access/gist/gistbuild.c | 4 + src/backend/access/hash/hash.c | 3 + src/backend/access/heap/heapam.c | 47 ++++++- src/backend/access/heap/heapam_handler.c | 57 +++++++-- src/backend/access/index/genam.c | 2 +- src/backend/access/nbtree/nbtsort.c | 30 ++++- src/backend/access/spgist/spginsert.c | 3 + src/backend/access/transam/xact.c | 11 ++ src/backend/catalog/index.c | 31 ++++- src/backend/commands/indexcmds.c | 17 +-- src/backend/optimizer/plan/planner.c | 10 ++ src/backend/tcop/utility.c | 3 + src/backend/utils/errcodes.txt | 1 + src/bin/pg_amcheck/t/006_cic.pl | 2 +- src/include/access/heapam.h | 2 + src/include/access/tableam.h | 28 ++++- src/include/access/xact.h | 1 + src/test/modules/injection_points/Makefile | 2 +- .../expected/cic_reset_snapshots.out | 115 ++++++++++++++++++ src/test/modules/injection_points/meson.build | 1 + .../sql/cic_reset_snapshots.sql | 95 +++++++++++++++
25 files changed, 474 insertions(+), 37 deletions(-) create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index e04b7ca694e..032df2d1999 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -557,7 +557,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK? */ + true, /* syncscan OK? */ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 6a7f8cb4a7c..a0e7ed2e137 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -335,7 +335,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 146ee97a47d..498eb2b991b 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1219,11 +1219,12 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) state->bs_sortstate = tuplesort_begin_index_brin(maintenance_work_mem, coordinate, TUPLESORT_NONE); - + InvalidateCatalogSnapshot(); /* scan the relation and merge per-worker results */ reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1236,6 
+1237,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = table_index_build_scan(heap, index, indexInfo, false, true, brinbuildCallback, state, NULL); + InvalidateCatalogSnapshot(); /* * process the final batch * @@ -1255,6 +1257,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -2391,6 +2394,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2416,9 +2420,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. 
@@ -2461,6 +2472,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -2540,6 +2553,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _brin_end_parallel(brinleader, NULL); return; } @@ -2556,6 +2571,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. */ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index c7e38dbe193..b0087cb1a62 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -29,6 +29,7 @@ #include "storage/bufmgr.h" #include "storage/proc.h" #include "storage/predicate.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/datum.h" #include "utils/memutils.h" @@ -680,6 +681,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_ParallelWorkers || !TransactionIdIsValid(MyProc->xmin)); + /* Report table scan phase started */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN); @@ -742,11 +746,13 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) tuplesort_begin_index_gin(heap, index, maintenance_work_mem, coordinate, TUPLESORT_NONE); + InvalidateCatalogSnapshot(); /* scan the 
relation in parallel and merge per-worker results */ reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -756,6 +762,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ reltuples = table_index_build_scan(heap, index, indexInfo, false, true, ginBuildCallback, &buildstate, NULL); + InvalidateCatalogSnapshot(); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); @@ -769,6 +776,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } MemoryContextDelete(buildstate.funcCtx); @@ -941,6 +949,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -965,9 +974,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. 
@@ -1010,6 +1026,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1084,6 +1102,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _gin_end_parallel(ginleader, NULL); return; } @@ -1100,6 +1120,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. */ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 7f57c787f4c..6207e9f5d81 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -43,6 +43,7 @@ #include "optimizer/optimizer.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -259,6 +260,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.indtuples = 0; buildstate.indtuplesSize = 0; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); if (buildstate.buildMode == GIST_SORTED_BUILD) { /* @@ -350,6 +352,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index e88ddb32a05..1ee1da1ec9b 100644 --- 
a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -30,6 +30,7 @@ #include "nodes/execnodes.h" #include "optimizer/plancat.h" #include "pgstat.h" +#include "storage/proc.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" @@ -198,6 +199,8 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index a231563f0df..62657d07f04 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -54,6 +54,7 @@ #include "utils/inval.h" #include "utils/spccache.h" #include "utils/syscache.h" +#include "utils/injection_point.h" static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, @@ -697,6 +698,36 @@ heap_prepare_pagescan(TableScanDesc sscan) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } +/* + * Reset the active snapshot during a scan. + * This ensures the xmin horizon can advance while maintaining safe tuple visibility. + * Note: No other snapshot should be active during this operation. + */ +static inline void +heap_reset_scan_snapshot(TableScanDesc sscan) +{ + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure active snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + PopActiveSnapshot(); + + sscan->rs_snapshot = InvalidSnapshot; /* just to be tidy */ + Assert(!HaveRegisteredOrActiveSnapshot()); + InvalidateCatalogSnapshot(); + + /* The goal of snapshot reset is to allow horizon to advance. */ + Assert(!TransactionIdIsValid(MyProc->xmin)); +#if USE_INJECTION_POINTS + /* In some cases it is still not possible due xid assign. 
*/ + if (!TransactionIdIsValid(MyProc->xid)) + INJECTION_POINT("heap_reset_scan_snapshot_effective", NULL); +#endif + + PushActiveSnapshot(GetLatestSnapshot()); + sscan->rs_snapshot = GetActiveSnapshot(); +} + /* * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM. * @@ -738,7 +769,12 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); if (BufferIsValid(scan->rs_cbuf)) + { scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT && + scan->rs_cblock % SO_RESET_SNAPSHOT_EACH_N_PAGE == 0) + heap_reset_scan_snapshot((TableScanDesc) scan); + } } /* @@ -1399,7 +1435,16 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_parallelworkerdata != NULL) pfree(scan->rs_parallelworkerdata); - +#ifdef USE_ASSERT_CHECKING + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) + { + Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)); + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure snapshot is not registered. 
*/ + Assert(GetActiveSnapshot()->regd_count == 0); + } +#endif if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 3ff36f59bf8..16c460aa23f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1210,6 +1210,8 @@ heapam_index_build_range_scan(Relation heapRelation, ExprContext *econtext; Snapshot snapshot; bool need_unregister_snapshot = false; + bool need_pop_active_snapshot = false; + bool reset_snapshots = false; TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; @@ -1244,9 +1246,6 @@ heapam_index_build_range_scan(Relation heapRelation, /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time @@ -1256,6 +1255,15 @@ heapam_index_build_range_scan(Relation heapRelation, */ OldestXmin = InvalidTransactionId; + /* + * For unique index we need consistent snapshot for the whole scan. + * In case of parallel scan some additional infrastructure required + * to perform scan with SO_RESET_SNAPSHOT which is not yet ready. + */ + reset_snapshots = indexInfo->ii_Concurrent && + !indexInfo->ii_Unique && + !is_system_catalog; /* just for the case */ + /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); @@ -1264,24 +1272,41 @@ heapam_index_build_range_scan(Relation heapRelation, { /* * Serial index build. - * - * Must begin our own heap scan in this case. 
We may also need to - * register a snapshot whose lifetime is under our direct control. */ if (!TransactionIdIsValid(OldestXmin)) { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - need_unregister_snapshot = true; + snapshot = GetTransactionSnapshot(); + /* + * Must begin our own heap scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. + * In case of resetting of snapshot during the scan registration is + * not allowed because snapshot is going to be changed every so + * often. + */ + if (!reset_snapshots) + { + snapshot = RegisterSnapshot(snapshot); + need_unregister_snapshot = true; + } + Assert(!ActiveSnapshotSet()); + PushActiveSnapshot(snapshot); + /* store link to snapshot because it may be copied */ + snapshot = GetActiveSnapshot(); + need_pop_active_snapshot = true; } else + { + Assert(!indexInfo->ii_Concurrent); snapshot = SnapshotAny; + } scan = table_beginscan_strat(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? */ + allow_sync, /* syncscan OK? */ + reset_snapshots /* reset snapshots? */); } else { @@ -1295,6 +1320,8 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; } hscan = (HeapScanDesc) scan; @@ -1309,6 +1336,13 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. 
*/ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1744,6 +1778,8 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) UnregisterSnapshot(snapshot); @@ -1816,7 +1852,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 5e89b86a62c..e4284181738 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -472,7 +472,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 69ef1527e06..6fc72bd2c94 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -261,7 +261,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -324,18 +324,22 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) 
RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. */ - _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -483,6 +487,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, else reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); + InvalidateCatalogSnapshot(); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -538,7 +545,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) { BTWriteState wstate; @@ -560,18 +567,21 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + InvalidateCatalogSnapshot(); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1410,6 +1420,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1435,9 +1446,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. 
*/ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1491,6 +1509,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1585,6 +1605,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1601,6 +1623,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 780ef646a54..49832a36470 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -24,6 +24,7 @@ #include "nodes/execnodes.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -143,6 +144,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) result = palloc0_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index aafc53e0164..7772d1379bc 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -3708,6 +3708,17 @@ PreventInTransactionBlock(bool isTopLevel, const char *stmtType) MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT; } +void +PreventIsolationUsesXactSnapshot(const char *stmtType) +{ + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND), + /* translator: %s represents an SQL statement name */ + errmsg("%s does not support transaction isolation higher than READ COMMITTED", + stmtType))); +} + /* * WarnNoTransactionBlock * RequireTransactionBlock diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 43de42ce39e..2ace60c1f07 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -80,6 +80,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; @@ -1490,8 +1491,8 @@ 
index_concurrently_build(Oid heapRelationId, Relation indexRelation; IndexInfo *indexInfo; - /* This had better make sure that a snapshot is active */ - Assert(ActiveSnapshotSet()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xid)); /* Open and lock the parent heap relation */ heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock); @@ -1509,19 +1510,29 @@ index_concurrently_build(Oid heapRelationId, indexRelation = index_open(indexRelationId, RowExclusiveLock); + /* BuildIndexInfo may require a snapshot for expressions and predicates */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * We have to re-build the IndexInfo struct, since it was lost in the * commit of the transaction where this concurrent index was created at * the catalog level. */ indexInfo = BuildIndexInfo(indexRelation); + /* Done with snapshot */ + PopActiveSnapshot(); Assert(!indexInfo->ii_ReadyForInserts); indexInfo->ii_Concurrent = true; indexInfo->ii_BrokenHotChain = false; + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + /* Now build the index */ index_build(heapRel, indexRelation, indexInfo, false, true); + InvalidateCatalogSnapshot(); + Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -1532,12 +1543,19 @@ index_concurrently_build(Oid heapRelationId, table_close(heapRel, NoLock); index_close(indexRelation, NoLock); + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates. 
*/ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); + /* we can do away with our snapshot */ + PopActiveSnapshot(); } /* @@ -3234,7 +3252,8 @@ IndexCheckExclusion(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK */ + true, /* syncscan OK */ + false); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3297,12 +3316,16 @@ IndexCheckExclusion(Relation heapRelation, * as of the start of the scan (see table_index_build_scan), whereas a normal * build takes care to include recently-dead tuples. This is OK because * we won't mark the index valid until all transactions that might be able - * to see those tuples are gone. The reason for doing that is to avoid + * to see those tuples are gone. One of the reasons for doing that is to avoid * bogus unique-index failures due to concurrent UPDATEs (we might see * different versions of the same row as being valid when we pass over them, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * + * Furthermore, in the case of a non-unique index we set SO_RESET_SNAPSHOT for the + * scan, which causes a new snapshot to be set as active every so often. The reason + * for that is to propagate the xmin horizon forward. + * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all * transactions that could have been modifying the table to terminate. Now diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 635679cc1f2..67e207220d1 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1701,23 +1701,17 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. 
* - * We now take a new snapshot, and build the index using all tuples that - * are visible in this snapshot. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using single or + * multiple refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its * HOT-chain or the extension of the chain is HOT-safe for this index. */ - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* Perform concurrent build of index */ index_concurrently_build(tableId, indexRelationId); - /* we can do away with our snapshot */ - PopActiveSnapshot(); - /* * Commit this transaction to make the indisready update visible. */ @@ -2874,8 +2868,11 @@ ExecReindex(ParseState *pstate, const ReindexStmt *stmt, bool isTopLevel) } if (concurrently) + { PreventInTransactionBlock(isTopLevel, "REINDEX CONCURRENTLY"); + PreventIsolationUsesXactSnapshot("REINDEX CONCURRENTLY"); + } params.options = (verbose ? REINDEXOPT_VERBOSE : 0) | @@ -4131,9 +4128,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* * Update progress for the index to build, with the correct parent * table involved. 
@@ -4148,7 +4142,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Perform concurrent build of new index */ index_concurrently_build(newidx->tableId, newidx->indexId); - PopActiveSnapshot(); CommitTransactionCommand(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 42604a0f75c..d6bb3546bea 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,6 +63,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/snapmgr.h" /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -7027,6 +7028,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) Relation heap; Relation index; RelOptInfo *rel; + bool need_pop_active_snapshot = false; int parallel_workers; BlockNumber heap_blocks; double reltuples; @@ -7082,6 +7084,12 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) heap = table_open(tableOid, NoLock); index = index_open(indexOid, NoLock); + /* Set ActiveSnapshot since functions in the indexes may need it */ + if (!ActiveSnapshotSet()) + { + PushActiveSnapshot(GetTransactionSnapshot()); + need_pop_active_snapshot = true; + } /* * Determine if it's safe to proceed. 
* @@ -7139,6 +7147,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) parallel_workers--; done: + if (need_pop_active_snapshot) + PopActiveSnapshot(); index_close(index, NoLock); table_close(heap, NoLock); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index bf707f2d57f..a0895f2c46b 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1460,8 +1460,11 @@ ProcessUtilitySlow(ParseState *pstate, bool is_alter_table; if (stmt->concurrent) + { PreventInTransactionBlock(isTopLevel, "CREATE INDEX CONCURRENTLY"); + PreventIsolationUsesXactSnapshot("CREATE INDEX CONCURRENTLY"); + } /* * Look up the relation OID just once, right here at the diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 5b25402ebbe..a80f08af241 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -259,6 +259,7 @@ Section: Class 25 - Invalid Transaction State 25P02 E ERRCODE_IN_FAILED_SQL_TRANSACTION in_failed_sql_transaction 25P03 E ERRCODE_IDLE_IN_TRANSACTION_SESSION_TIMEOUT idle_in_transaction_session_timeout 25P04 E ERRCODE_TRANSACTION_TIMEOUT transaction_timeout +25P05 E ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND inappropriate_isolation_level_for_command Section: Class 26 - Invalid SQL Statement Name diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl index 70b185212e7..4ae7b796c6e 100644 --- a/src/bin/pg_amcheck/t/006_cic.pl +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -152,7 +152,7 @@ $node->pgbench( 0, [qr{actually processed}], [qr{^$}], - 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN', { 'concurrent_ops_gin_idx' => q( SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 24a27cc043a..9499d5f05a9 100644 --- a/src/include/access/heapam.h +++ 
b/src/include/access/heapam.h @@ -43,6 +43,8 @@ #define HEAP_PAGE_PRUNE_MARK_UNUSED_NOW (1 << 0) #define HEAP_PAGE_PRUNE_FREEZE (1 << 1) +#define SO_RESET_SNAPSHOT_EACH_N_PAGE 4096 + typedef struct BulkInsertStateData *BulkInsertState; typedef struct GlobalVisState GlobalVisState; typedef struct TupleTableSlot TupleTableSlot; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 06084752245..d0709782dde 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -25,6 +25,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/injection_point.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -63,6 +64,17 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* + * Reset scan and catalog snapshot every so often? If so, every + * SO_RESET_SNAPSHOT_EACH_N_PAGE pages the active snapshot is popped, the + * catalog snapshot is invalidated, and the latest snapshot is pushed as active. + * + * At the end of the scan the snapshot is not popped. + * The goal of this mode is to keep the xmin horizon moving forward. + * + * see heap_reset_scan_snapshot for details. + */ + SO_RESET_SNAPSHOT = 1 << 10, } ScanOptions; /* @@ -919,7 +931,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, - bool allow_strat, bool allow_sync) + bool allow_strat, bool allow_sync, + bool reset_snapshot) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; @@ -927,6 +940,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, flags |= SO_ALLOW_STRAT; if (allow_sync) flags |= SO_ALLOW_SYNC; + if (reset_snapshot) + { + INJECTION_POINT("table_beginscan_strat_reset_snapshots", NULL); + /* Active snapshot is required on start. */ + Assert(GetActiveSnapshot() == snapshot); + /* Active snapshot should not be registered to keep xmin propagating. 
*/ + Assert(GetActiveSnapshot()->regd_count == 0); + flags |= (SO_RESET_SNAPSHOT); + } return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); } @@ -1760,6 +1782,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * very hard to detect whether they're really incompatible with the chain tip. * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. + * + * In case of non-unique index and non-parallel concurrent build, + * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots + * on the fly to allow xmin horizon propagate. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/access/xact.h b/src/include/access/xact.h index f0b4d795071..2e3f617a949 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -488,6 +488,7 @@ extern bool IsTransactionOrTransactionBlock(void); extern char TransactionBlockStatusCode(void); extern void AbortOutOfAnyTransaction(void); extern void PreventInTransactionBlock(bool isTopLevel, const char *stmtType); +extern void PreventIsolationUsesXactSnapshot(const char *stmtType); extern void RequireTransactionBlock(bool isTopLevel, const char *stmtType); extern void WarnNoTransactionBlock(bool isTopLevel, const char *stmtType); extern bool IsInTransactionBlock(bool isTopLevel); diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index a41d781f8c9..eeaeaaf2cc6 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -9,7 +9,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc vacuum +REGRESS = injection_points hashagg reindex_conc vacuum cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic \ diff --git 
a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 00000000000..b4ad90eb339 --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 +1,115 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX 
CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX 
CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY cannot run inside a transaction block +ROLLBACK ; +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY does not support transaction isolation higher than READ COMMITTED +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +ERROR: REINDEX CONCURRENTLY does not support transaction 
isolation higher than READ COMMITTED +RESET default_transaction_isolation; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index fcc85414515..5d298cd2710 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -36,6 +36,7 @@ tests += { 'hashagg', 'reindex_conc', 'vacuum', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 00000000000..7f7dffa5be4 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,95 @@ +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET 
(parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY 
cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ROLLBACK ; + +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; -- 2.43.0 [application/x-patch] v2-0001-Add-stress-tests-for-concurrent-index-builds.patch (9.3K, 4-v2-0001-Add-stress-tests-for-concurrent-index-builds.patch) download | inline diff: From df3da79299db69d7e068d7a497eea1f350556108 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 30 Nov 2024 16:24:20 +0100 Subject: [PATCH v2 1/4] Add stress tests for concurrent index builds Introduce stress tests for concurrent index operations: - test concurrent inserts/updates during CREATE/REINDEX INDEX CONCURRENTLY - cover various index types (btree, gin, gist, brin, hash, spgist) - test unique and non-unique indexes - test with expressions and predicates - test both parallel and non-parallel operations These tests verify the behavior of the following commits. 
--- src/bin/pg_amcheck/meson.build | 1 + src/bin/pg_amcheck/t/006_cic.pl | 225 ++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 src/bin/pg_amcheck/t/006_cic.pl diff --git a/src/bin/pg_amcheck/meson.build b/src/bin/pg_amcheck/meson.build index 592cef74ecb..51a62dccb7b 100644 --- a/src/bin/pg_amcheck/meson.build +++ b/src/bin/pg_amcheck/meson.build @@ -28,6 +28,7 @@ tests += { 't/003_check.pl', 't/004_verify_heapam.pl', 't/005_opclass_damage.pl', + 't/006_cic.pl', ], }, } diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl new file mode 100644 index 00000000000..70b185212e7 --- /dev/null +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -0,0 +1,225 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test REINDEX CONCURRENTLY with concurrent modifications and HOT updates +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my ($node, $result); + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('RC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . 
(1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->append_conf('postgresql.conf', 'fsync = off'); +$node->append_conf('postgresql.conf', 'maintenance_work_mem = 32MB'); # to avoid OOM +$node->append_conf('postgresql.conf', 'shared_buffers = 32MB'); # to avoid OOM +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE TABLE tbl(i int primary key, + c1 money default 0, c2 money default 0, + c3 money default 0, updated_at timestamp, + ia int4[], p point))); +# uncomment to force non-HOT -> $node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i, updated_at);)); +# create sequence +$node->safe_psql('postgres', q(CREATE UNLOGGED SEQUENCE in_row_rebuild START 1 INCREMENT 1;)); +$node->safe_psql('postgres', q(SELECT nextval('in_row_rebuild');)); + +# Create helper functions for predicate tests +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_stable() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN true; + END; $$; +)); + +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_const(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + RETURN MOD($1, 2) = 0; + END; $$; +)); + +# Run CIC/RIC in different options concurrently with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY', + { + 'concurrent_ops' => q( + SET debug_parallel_query = off; -- this is because predicate_stable implementation + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set variant random(0, 5) + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at); + \elif :variant = 1 + CREATE 
INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_stable(); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE MOD(i, 2) = 0; + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_const(i); + \elif :variant = 4 + CREATE INDEX CONCURRENTLY new_idx ON tbl(predicate_const(i)); + \elif :variant = 5 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, predicate_const(i), updated_at) WHERE predicate_const(i); + \endif + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1000, 100000) + BEGIN; + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + COMMIT; + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for unique index concurrently with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for unique BTREE', + { + 'concurrent_ops_unique_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels 
random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE UNIQUE INDEX CONCURRENTLY new_idx ON tbl(i); + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIN with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_gin_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIN (ia); + \sleep 10 ms + SELECT gin_index_check('new_idx'); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT gin_index_check('new_idx'); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIST/BRIN/HASH/SPGIST index concurrently with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + 
[qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_other_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \set variant random(0, 3) + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIST (p); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING BRIN (updated_at); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING HASH (updated_at); + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING SPGIST (p); + \endif + \sleep 10 ms + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->stop; +done_testing(); \ No newline at end of file -- 2.43.0 [application/x-patch] v2-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch (37.6K, 5-v2-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch) download | inline diff: From 39b26f7deb07901048fdf2ee6193b38f47fdcef3 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 18:45:56 +0300 Subject: [PATCH v2 3/4] Support snapshot resets in parallel concurrent index builds Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. The main limitation of applying that technique to parallel builds was a requirement to wait until worker processes restore their initial snapshot from leader. 
To address this, the following changes are applied: - add infrastructure to track snapshot restoration in parallel workers - extend parallel scan initialization to support periodic snapshot resets - wait for parallel workers to restore their initial snapshots before proceeding with scan - relax the restriction that prevented parallel workers from calling GetLatestSnapshot --- src/backend/access/brin/brin.c | 54 ++++++++++------- src/backend/access/gin/gininsert.c | 51 +++++++++------- src/backend/access/heap/heapam_handler.c | 12 ++-- src/backend/access/nbtree/nbtsort.c | 60 ++++++++++++++----- src/backend/access/table/tableam.c | 37 ++++++++++-- src/backend/access/transam/parallel.c | 49 +++++++++++++-- src/backend/catalog/index.c | 2 +- src/backend/executor/nodeSeqscan.c | 3 +- src/backend/executor/nodeTidrangescan.c | 3 +- src/backend/utils/time/snapmgr.c | 8 --- src/include/access/parallel.h | 3 +- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 5 +- .../sql/cic_reset_snapshots.sql | 5 +- 14 files changed, 208 insertions(+), 85 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 498eb2b991b..c1839e15c9d 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -145,7 +145,6 @@ typedef struct BrinLeader */ BrinShared *brinshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } BrinLeader; @@ -233,7 +232,7 @@ static void brin_fill_empty_ranges(BrinBuildState *state, static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state); -static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _brin_parallel_estimate_shared(Relation heap); static double _brin_parallel_heapscan(BrinBuildState *state); static double _brin_parallel_merge(BrinBuildState *state); static void 
_brin_leader_participate_as_worker(BrinBuildState *buildstate, @@ -1224,7 +1223,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1257,7 +1255,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -1273,6 +1270,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = idxtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + return result; } @@ -2385,7 +2385,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estbrinshared; Size estsort; BrinShared *brinshared; @@ -2416,25 +2415,25 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. 
*/ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. */ - estbrinshared = _brin_parallel_estimate_shared(heap, snapshot); + estbrinshared = _brin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -2474,8 +2473,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -2500,7 +2497,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromBrinShared(brinshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -2546,7 +2544,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, brinleader->nparticipanttuplesorts++; brinleader->brinshared = brinshared; brinleader->sharedsort = sharedsort; - brinleader->snapshot = snapshot; brinleader->walusage = walusage; brinleader->bufferusage = bufferusage; @@ -2562,6 +2559,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = brinleader; + /* + * In case of concurrent build snapshots are going to be reset periodically. + * We need to wait until all workers imported initial snapshot. 
+ */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _brin_leader_participate_as_worker(buildstate, heap, index); @@ -2570,9 +2574,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!brinleader->brinshared->isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -2593,9 +2601,6 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(brinleader->snapshot)) - UnregisterSnapshot(brinleader->snapshot); DestroyParallelContext(brinleader->pcxt); ExitParallelMode(); } @@ -2795,14 +2800,14 @@ _brin_parallel_merge(BrinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * brin index build based on the snapshot its parallel scan will use. + * brin index build. */ static Size -_brin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_brin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(BrinShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -2964,6 +2969,13 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort, heapRel, indexRel, sortmem, false); + if (brinshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index b0087cb1a62..d46ad210651 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -135,7 +135,6 @@ typedef struct GinLeader */ GinBuildShared *ginshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } GinLeader; @@ -185,7 +184,7 @@ typedef struct static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state); -static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _gin_parallel_estimate_shared(Relation heap); static double _gin_parallel_heapscan(GinBuildState *state); static double _gin_parallel_merge(GinBuildState *state); static void _gin_leader_participate_as_worker(GinBuildState *buildstate, @@ -752,7 +751,6 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -776,7 +774,6 @@ ginbuild(Relation heap, Relation index, IndexInfo 
*indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } MemoryContextDelete(buildstate.funcCtx); @@ -806,6 +803,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } @@ -940,7 +938,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estginshared; Size estsort; GinBuildShared *ginshared; @@ -970,25 +967,25 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. */ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. 
*/ - estginshared = _gin_parallel_estimate_shared(heap, snapshot); + estginshared = _gin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estginshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -1028,8 +1025,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -1053,7 +1048,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromGinBuildShared(ginshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1095,7 +1091,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, ginleader->nparticipanttuplesorts++; ginleader->ginshared = ginshared; ginleader->sharedsort = sharedsort; - ginleader->snapshot = snapshot; ginleader->walusage = walusage; ginleader->bufferusage = bufferusage; @@ -1111,6 +1106,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = ginleader; + /* + * In case of concurrent build snapshots are going to be reset periodically. + * We need to wait until all workers imported initial snapshot. + */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _gin_leader_participate_as_worker(buildstate, heap, index); @@ -1119,9 +1121,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!ginleader->ginshared->isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1142,9 +1147,6 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(ginleader->snapshot)) - UnregisterSnapshot(ginleader->snapshot); DestroyParallelContext(ginleader->pcxt); ExitParallelMode(); } @@ -1819,14 +1821,14 @@ _gin_parallel_merge(GinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * gin index build based on the snapshot its parallel scan will use. + * gin index build. */ static Size -_gin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_gin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(GinBuildShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -2211,6 +2213,13 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort, heapRel, indexRel, sortmem, false); + if (ginshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 16c460aa23f..24a554a10d4 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1251,14 +1251,13 @@ heapam_index_build_range_scan(Relation heapRelation, * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. + * and index whatever's live according to that while that snapshot is reset + * every so often (in case of non-unique index). */ OldestXmin = InvalidTransactionId; /* * For unique index we need consistent snapshot for the whole scan. - * In case of parallel scan some additional infrastructure required - * to perform scan with SO_RESET_SNAPSHOT which is not yet ready. 
*/ reset_snapshots = indexInfo->ii_Concurrent && !indexInfo->ii_Unique && @@ -1320,8 +1319,11 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; - PushActiveSnapshot(snapshot); - need_pop_active_snapshot = true; + if (!reset_snapshots) + { + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; + } } hscan = (HeapScanDesc) scan; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6fc72bd2c94..9c395d1ac38 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -324,22 +324,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -488,8 +486,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -1421,6 +1418,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1438,12 +1436,21 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; + /* + * For concurrent non-unique index builds, we can periodically reset snapshots + * to allow the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. Unique indexes still need + * a stable snapshot to properly enforce uniqueness constraints. 
+ */ + reset_snapshot = isconcurrent && !btspool->isunique; + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * live according to that, while that snapshot may be reset periodically in + * case of non-unique index. */ if (!isconcurrent) { @@ -1451,6 +1458,11 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); @@ -1511,7 +1523,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1538,7 +1550,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->brokenhotchain = false; table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1614,6 +1627,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* Save leader state now that it's clear build will be parallel */ buildstate->btleader = btleader; + /* + * In case of concurrent build snapshots are going to be reset periodically. + * Wait until all workers imported initial snapshot. 
+ */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _bt_leader_participate_as_worker(buildstate); @@ -1622,9 +1642,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot|| !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1646,7 +1670,7 @@ _bt_end_parallel(BTLeader *btleader) InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(btleader->snapshot)) + if (btleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(btleader->snapshot)) UnregisterSnapshot(btleader->snapshot); DestroyParallelContext(btleader->pcxt); ExitParallelMode(); @@ -1896,6 +1920,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, SortCoordinate coordinate; BTBuildState buildstate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -1950,11 +1975,15 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, /* Join parallel scan */ indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; - scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared)); + pscan = ParallelTableScanFromBTShared(btshared); + scan = table_beginscan_parallel(btspool->heap, pscan); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || 
!TransactionIdIsValid(MyProc->xmin)); /* Execute this worker's part of the sort */ if (progress) @@ -1990,4 +2019,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, tuplesort_end(btspool->sortstate); if (btspool2) tuplesort_end(btspool2->sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index dfda1af412e..26df5638921 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -132,10 +132,10 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) { Size sz = 0; - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) sz = add_size(sz, EstimateSnapshotSpace(snapshot)); else - Assert(snapshot == SnapshotAny); + Assert(snapshot == SnapshotAny || snapshot == InvalidSnapshot); sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); @@ -144,21 +144,36 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot) + Snapshot snapshot, bool reset_snapshot) { Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); pscan->phs_snapshot_off = snapshot_off; - if (IsMVCCSnapshot(snapshot)) + /* + * Initialize parallel scan description. For normal scans with a regular + * MVCC snapshot, serialize the snapshot info. For scans that use periodic + * snapshot resets, mark the scan accordingly. 
+ */ + if (reset_snapshot) + { + Assert(snapshot == InvalidSnapshot); + pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = true; + INJECTION_POINT("table_parallelscan_initialize", NULL); + } + else if (IsMVCCSnapshot(snapshot)) { SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = false; } else { Assert(snapshot == SnapshotAny); + Assert(!reset_snapshot); pscan->phs_snapshot_any = true; + pscan->phs_reset_snapshot = false; } } @@ -171,7 +186,19 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); - if (!pscan->phs_snapshot_any) + /* + * For scans that + * use periodic snapshot resets, mark the scan accordingly and use the active + * snapshot as the initial state. + */ + if (pscan->phs_reset_snapshot) + { + Assert(ActiveSnapshotSet()); + flags |= SO_RESET_SNAPSHOT; + /* Start with current active snapshot. */ + snapshot = GetActiveSnapshot(); + } + else if (!pscan->phs_snapshot_any) { /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index ab1dfb30e73..0498c07c37b 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -79,6 +79,7 @@ #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) #define PARALLEL_KEY_CLIENTCONNINFO UINT64CONST(0xFFFFFFFFFFFF000F) +#define PARALLEL_KEY_SNAPSHOT_RESTORED UINT64CONST(0xFFFFFFFFFFFF0010) /* Fixed-size parallel state. 
*/ typedef struct FixedParallelState @@ -308,6 +309,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(bool), + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + /* Estimate how much we'll need for the entrypoint info. */ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + strlen(pcxt->function_name) + 2); @@ -379,6 +384,7 @@ InitializeParallelDSM(ParallelContext *pcxt) char *entrypointstate; char *uncommittedenumsspace; char *clientconninfospace; + bool *snapshot_set_flag_space; Size lnamelen; /* Serialize shared libraries we have loaded. */ @@ -494,6 +500,19 @@ InitializeParallelDSM(ParallelContext *pcxt) strcpy(entrypointstate, pcxt->library_name); strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + + /* + * Establish dynamic shared memory to pass information about importing + * of snapshot. + */ + snapshot_set_flag_space = + shm_toc_allocate(pcxt->toc, mul_size(sizeof(bool), pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + pcxt->worker[i].snapshot_restored = &snapshot_set_flag_space[i]; + *pcxt->worker[i].snapshot_restored = false; + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, snapshot_set_flag_space); } /* Update nworkers_to_launch, in case we changed nworkers above. */ @@ -670,6 +689,10 @@ LaunchParallelWorkers(ParallelContext *pcxt) * Wait for all workers to attach to their error queues, and throw an error if * any worker fails to do this. * + * wait_for_snapshot: track whether each parallel worker has successfully restored + * its snapshot. This is needed when using periodic snapshot resets to ensure all + * workers have a valid initial snapshot before proceeding with the scan. 
+ * * Callers can assume that if this function returns successfully, then the * number of workers given by pcxt->nworkers_launched have initialized and * attached to their error queues. Whether or not these workers are guaranteed @@ -699,7 +722,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) * call this function at all. */ void -WaitForParallelWorkersToAttach(ParallelContext *pcxt) +WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot) { int i; @@ -743,9 +766,12 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); if (shm_mq_get_sender(mq) != NULL) { - /* Yes, so it is known to be attached. */ - pcxt->known_attached_workers[i] = true; - ++pcxt->nknown_attached_workers; + if (!wait_for_snapshot || *(pcxt->worker[i].snapshot_restored)) + { + /* Yes, so it is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } } } else if (status == BGWH_STOPPED) @@ -788,6 +814,16 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) break; } } + + /* Set snapshot restored flag to false. */ + if (pcxt->nworkers > 0) + { + bool *snapshot_restored_space; + snapshot_restored_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + for (i = 0; i < pcxt->nworkers; ++i) + snapshot_restored_space[i] = false; + } } /* @@ -1304,6 +1340,7 @@ ParallelWorkerMain(Datum main_arg) shm_toc *toc; FixedParallelState *fps; char *error_queue_space; + bool *snapshot_restored_space; shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; @@ -1507,6 +1544,10 @@ ParallelWorkerMain(Datum main_arg) fps->parallel_leader_pgproc); PushActiveSnapshot(asnapshot); + /* Snapshot is restored, set flag to make leader know about it. */ + snapshot_restored_space = shm_toc_lookup(toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + snapshot_restored_space[ParallelWorkerNumber] = true; + /* * We've changed which tuples we can see, and must therefore invalidate * system caches. 
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 2ace60c1f07..b54921ad546 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1531,7 +1531,7 @@ index_concurrently_build(Oid heapRelationId, index_build(heapRel, indexRelation, indexInfo, false, true); InvalidateCatalogSnapshot(); - Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index af3c788ce8b..01f9f330eee 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -371,7 +371,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel(node->ss.ss_currentRelation, pscan); diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 503817da65b..f31667117ad 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -455,7 +455,8 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 2e6197f5f35..df9116532c0 100644 --- 
a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -353,14 +353,6 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { - /* - * We might be able to relax this, but nothing that could otherwise work - * needs it. - */ - if (IsInParallelMode()) - elog(ERROR, - "cannot update SecondarySnapshot during a parallel operation"); - /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if required. diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 60f857675e0..9a3e14e0c0e 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -28,6 +28,7 @@ typedef struct ParallelWorkerInfo { BackgroundWorkerHandle *bgwhandle; shm_mq_handle *error_mqh; + bool *snapshot_restored; } ParallelWorkerInfo; typedef struct ParallelContext @@ -67,7 +68,7 @@ extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); extern void LaunchParallelWorkers(ParallelContext *pcxt); -extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot); extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); extern void DestroyParallelContext(ParallelContext *pcxt); extern bool ParallelContextActive(void); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index ce340c076f8..acfa06aed78 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -81,6 +81,7 @@ typedef struct ParallelTableScanDescData RelFileLocator phs_locator; /* physical relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + bool phs_reset_snapshot; /* use SO_RESET_SNAPSHOT? 
*/ Size phs_snapshot_off; /* data for snapshot */ } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index d0709782dde..84b06ffa42f 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1151,7 +1151,8 @@ extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); */ extern void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot); + Snapshot snapshot, + bool reset_snapshot); /* * Begin a parallel scan. `pscan` needs to have been initialized with @@ -1783,7 +1784,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique index and non-parallel concurrent build, + * In case of non-unique concurrent index build, * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots * on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 7f7dffa5be4..37819bf0fb7 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -3,7 +3,7 @@ CREATE EXTENSION injection_points; SELECT injection_points_set_local(); SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); - +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); @@ -53,6 +53,9 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- 2.43.0 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY 2026-03-09 00:03 Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> @ 2026-03-20 01:15 ` Mihail Nikalayeu <[email protected]> 2026-03-21 23:50 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 0 siblings, 1 reply; 6+ messages in thread From: Mihail Nikalayeu @ 2026-03-20 01:15 UTC (permalink / raw) To: PostgreSQL Hackers <[email protected]>; +Cc: Matthias van de Meent <[email protected]>; Álvaro Herrera <[email protected]>; Antonin Houska <[email protected]>; Sergey Sargsyan <[email protected]>; Hannu Krosing <[email protected]> Rebased. Attachments: [application/octet-stream] v3-0001-Add-stress-tests-for-concurrent-index-builds.patch (9.3K, 2-v3-0001-Add-stress-tests-for-concurrent-index-builds.patch) download | inline diff: From 7702556fe08d2387254ab30409ea3f1113eba0f1 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 30 Nov 2024 16:24:20 +0100 Subject: [PATCH v3 1/4] Add stress tests for concurrent index builds Introduce stress tests for concurrent index operations: - test concurrent inserts/updates during CREATE/REINDEX INDEX CONCURRENTLY - cover various index types (btree, gin, gist, brin, hash, spgist) - test unique and non-unique indexes - test with expressions and predicates - test both parallel and non-parallel operations These tests verify the behavior of the following commits. 
--- src/bin/pg_amcheck/meson.build | 1 + src/bin/pg_amcheck/t/006_cic.pl | 225 ++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 src/bin/pg_amcheck/t/006_cic.pl diff --git a/src/bin/pg_amcheck/meson.build b/src/bin/pg_amcheck/meson.build index 592cef74ecb..51a62dccb7b 100644 --- a/src/bin/pg_amcheck/meson.build +++ b/src/bin/pg_amcheck/meson.build @@ -28,6 +28,7 @@ tests += { 't/003_check.pl', 't/004_verify_heapam.pl', 't/005_opclass_damage.pl', + 't/006_cic.pl', ], }, } diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl new file mode 100644 index 00000000000..70b185212e7 --- /dev/null +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -0,0 +1,225 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test REINDEX CONCURRENTLY with concurrent modifications and HOT updates +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my ($node, $result); + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('RC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . 
(1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->append_conf('postgresql.conf', 'fsync = off'); +$node->append_conf('postgresql.conf', 'maintenance_work_mem = 32MB'); # to avoid OOM +$node->append_conf('postgresql.conf', 'shared_buffers = 32MB'); # to avoid OOM +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE TABLE tbl(i int primary key, + c1 money default 0, c2 money default 0, + c3 money default 0, updated_at timestamp, + ia int4[], p point))); +# uncomment to force non-HOT -> $node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i, updated_at);)); +# create sequence +$node->safe_psql('postgres', q(CREATE UNLOGGED SEQUENCE in_row_rebuild START 1 INCREMENT 1;)); +$node->safe_psql('postgres', q(SELECT nextval('in_row_rebuild');)); + +# Create helper functions for predicate tests +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_stable() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN true; + END; $$; +)); + +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_const(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + RETURN MOD($1, 2) = 0; + END; $$; +)); + +# Run CIC/RIC in different options concurrently with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY', + { + 'concurrent_ops' => q( + SET debug_parallel_query = off; -- this is because predicate_stable implementation + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set variant random(0, 5) + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at); + \elif :variant = 1 + CREATE 
INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_stable(); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE MOD(i, 2) = 0; + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_const(i); + \elif :variant = 4 + CREATE INDEX CONCURRENTLY new_idx ON tbl(predicate_const(i)); + \elif :variant = 5 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, predicate_const(i), updated_at) WHERE predicate_const(i); + \endif + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1000, 100000) + BEGIN; + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + COMMIT; + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for unique index concurrently with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for unique BTREE', + { + 'concurrent_ops_unique_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels 
random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE UNIQUE INDEX CONCURRENTLY new_idx ON tbl(i); + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIN with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_gin_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIN (ia); + \sleep 10 ms + SELECT gin_index_check('new_idx'); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT gin_index_check('new_idx'); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIST/BRIN/HASH/SPGIST index concurrently with upserts +$node->pgbench( + '--no-vacuum --client=15 --jobs=4 --exit-on-abort --transactions=1000', + 0, + 
[qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_other_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \set variant random(0, 3) + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIST (p); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING BRIN (updated_at); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING HASH (updated_at); + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING SPGIST (p); + \endif + \sleep 10 ms + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->stop; +done_testing(); \ No newline at end of file -- 2.43.0 [application/octet-stream] v3-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch (37.6K, 3-v3-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch) download | inline diff: From 0a837e3e2fac5efa137aceb67468229aa58b1ec9 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 18:45:56 +0300 Subject: [PATCH v3 3/4] Support snapshot resets in parallel concurrent index builds Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. 
The main limitation of applying that technique to parallel builds was the requirement to wait until worker processes restore their initial snapshot from the leader. To address this, the following changes are applied: - add infrastructure to track snapshot restoration in parallel workers - extend parallel scan initialization to support periodic snapshot resets - wait for parallel workers to restore their initial snapshots before proceeding with scan - relax limitation for parallel worker to call GetLatestSnapshot --- src/backend/access/brin/brin.c | 54 ++++++++++------- src/backend/access/gin/gininsert.c | 51 +++++++++------- src/backend/access/heap/heapam_handler.c | 12 ++-- src/backend/access/nbtree/nbtsort.c | 60 ++++++++++++++----- src/backend/access/table/tableam.c | 37 ++++++++++-- src/backend/access/transam/parallel.c | 49 +++++++++++++-- src/backend/catalog/index.c | 2 +- src/backend/executor/nodeSeqscan.c | 3 +- src/backend/executor/nodeTidrangescan.c | 3 +- src/backend/utils/time/snapmgr.c | 8 --- src/include/access/parallel.h | 3 +- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 5 +- .../sql/cic_reset_snapshots.sql | 5 +- 14 files changed, 208 insertions(+), 85 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 8534d7125a0..d3fc8c14a79 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -147,7 +147,6 @@ typedef struct BrinLeader */ BrinShared *brinshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } BrinLeader; @@ -235,7 +234,7 @@ static void brin_fill_empty_ranges(BrinBuildState *state, static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state); -static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _brin_parallel_estimate_shared(Relation heap); 
static double _brin_parallel_heapscan(BrinBuildState *state); static double _brin_parallel_merge(BrinBuildState *state); static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, @@ -1226,7 +1225,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1259,7 +1257,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -1275,6 +1272,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = idxtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + return result; } @@ -2384,7 +2384,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estbrinshared; Size estsort; BrinShared *brinshared; @@ -2415,25 +2414,25 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. 
*/ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. */ - estbrinshared = _brin_parallel_estimate_shared(heap, snapshot); + estbrinshared = _brin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -2473,8 +2472,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -2499,7 +2496,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromBrinShared(brinshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -2545,7 +2543,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, brinleader->nparticipanttuplesorts++; brinleader->brinshared = brinshared; brinleader->sharedsort = sharedsort; - brinleader->snapshot = snapshot; brinleader->walusage = walusage; brinleader->bufferusage = bufferusage; @@ -2561,6 +2558,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = brinleader; + /* + * In case of concurrent build snapshots are going to be reset periodically. + * We need to wait until all workers imported initial snapshot. 
+ */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _brin_leader_participate_as_worker(buildstate, heap, index); @@ -2569,9 +2573,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!brinleader->brinshared->isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -2592,9 +2600,6 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(brinleader->snapshot)) - UnregisterSnapshot(brinleader->snapshot); DestroyParallelContext(brinleader->pcxt); ExitParallelMode(); } @@ -2794,14 +2799,14 @@ _brin_parallel_merge(BrinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * brin index build based on the snapshot its parallel scan will use. + * brin index build. */ static Size -_brin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_brin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(BrinShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -2963,6 +2968,13 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort, heapRel, indexRel, sortmem, false); + if (brinshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index c3085d174c7..e9220b0a72e 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -138,7 +138,6 @@ typedef struct GinLeader */ GinBuildShared *ginshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } GinLeader; @@ -188,7 +187,7 @@ typedef struct static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state); -static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _gin_parallel_estimate_shared(Relation heap); static double _gin_parallel_heapscan(GinBuildState *state); static double _gin_parallel_merge(GinBuildState *state); static void _gin_leader_participate_as_worker(GinBuildState *buildstate, @@ -755,7 +754,6 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -779,7 +777,6 @@ ginbuild(Relation heap, Relation index, IndexInfo 
*indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } MemoryContextDelete(buildstate.funcCtx); @@ -809,6 +806,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } @@ -947,7 +945,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estginshared; Size estsort; GinBuildShared *ginshared; @@ -977,25 +974,25 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. */ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. 
*/ - estginshared = _gin_parallel_estimate_shared(heap, snapshot); + estginshared = _gin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estginshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -1035,8 +1032,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -1060,7 +1055,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromGinBuildShared(ginshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1102,7 +1098,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, ginleader->nparticipanttuplesorts++; ginleader->ginshared = ginshared; ginleader->sharedsort = sharedsort; - ginleader->snapshot = snapshot; ginleader->walusage = walusage; ginleader->bufferusage = bufferusage; @@ -1118,6 +1113,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = ginleader; + /* + * In case of concurrent build snapshots are going to be reset periodically. + * We need to wait until all workers imported initial snapshot. + */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _gin_leader_participate_as_worker(buildstate, heap, index); @@ -1126,9 +1128,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!ginleader->ginshared->isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1149,9 +1154,6 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(ginleader->snapshot)) - UnregisterSnapshot(ginleader->snapshot); DestroyParallelContext(ginleader->pcxt); ExitParallelMode(); } @@ -1826,14 +1828,14 @@ _gin_parallel_merge(GinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * gin index build based on the snapshot its parallel scan will use. + * gin index build. */ static Size -_gin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_gin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(GinBuildShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -2218,6 +2220,13 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort, heapRel, indexRel, sortmem, false); + if (ginshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 9cad2e115bd..a639e8b31e0 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1260,14 +1260,13 @@ heapam_index_build_range_scan(Relation heapRelation, * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. + * and index whatever's live according to that while that snapshot is reset + * every so often (in case of non-unique index). */ OldestXmin = InvalidTransactionId; /* * For unique index we need consistent snapshot for the whole scan. - * In case of parallel scan some additional infrastructure required - * to perform scan with SO_RESET_SNAPSHOT which is not yet ready. 
*/ reset_snapshots = indexInfo->ii_Concurrent && !indexInfo->ii_Unique && @@ -1329,8 +1328,11 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; - PushActiveSnapshot(snapshot); - need_pop_active_snapshot = true; + if (!reset_snapshots) + { + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; + } } hscan = (HeapScanDesc) scan; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6f091d39813..01e338fb562 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -325,22 +325,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -489,8 +487,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -1422,6 +1419,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1439,12 +1437,21 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; + /* + * For concurrent non-unique index builds, we can periodically reset snapshots + * to allow the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. Unique indexes still need + * a stable snapshot to properly enforce uniqueness constraints. 
+ */ + reset_snapshot = isconcurrent && !btspool->isunique; + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * live according to that, while that snapshot may be reset periodically in + * case of non-unique index. */ if (!isconcurrent) { @@ -1452,6 +1459,11 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); @@ -1512,7 +1524,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1539,7 +1551,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->brokenhotchain = false; table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1615,6 +1628,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* Save leader state now that it's clear build will be parallel */ buildstate->btleader = btleader; + /* + * In case of concurrent build snapshots are going to be reset periodically. + * Wait until all workers imported initial snapshot. 
+ */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _bt_leader_participate_as_worker(buildstate); @@ -1623,9 +1643,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot|| !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1647,7 +1671,7 @@ _bt_end_parallel(BTLeader *btleader) InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(btleader->snapshot)) + if (btleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(btleader->snapshot)) UnregisterSnapshot(btleader->snapshot); DestroyParallelContext(btleader->pcxt); ExitParallelMode(); @@ -1897,6 +1921,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, SortCoordinate coordinate; BTBuildState buildstate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -1951,11 +1976,15 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, /* Join parallel scan */ indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; - scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared)); + pscan = ParallelTableScanFromBTShared(btshared); + scan = table_beginscan_parallel(btspool->heap, pscan); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || 
!TransactionIdIsValid(MyProc->xmin)); /* Execute this worker's part of the sort */ if (progress) @@ -1991,4 +2020,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, tuplesort_end(btspool->sortstate); if (btspool2) tuplesort_end(btspool2->sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index dfda1af412e..26df5638921 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -132,10 +132,10 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) { Size sz = 0; - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) sz = add_size(sz, EstimateSnapshotSpace(snapshot)); else - Assert(snapshot == SnapshotAny); + Assert(snapshot == SnapshotAny || snapshot == InvalidSnapshot); sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); @@ -144,21 +144,36 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot) + Snapshot snapshot, bool reset_snapshot) { Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); pscan->phs_snapshot_off = snapshot_off; - if (IsMVCCSnapshot(snapshot)) + /* + * Initialize parallel scan description. For normal scans with a regular + * MVCC snapshot, serialize the snapshot info. For scans that use periodic + * snapshot resets, mark the scan accordingly. 
+ */ + if (reset_snapshot) + { + Assert(snapshot == InvalidSnapshot); + pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = true; + INJECTION_POINT("table_parallelscan_initialize", NULL); + } + else if (IsMVCCSnapshot(snapshot)) { SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = false; } else { Assert(snapshot == SnapshotAny); + Assert(!reset_snapshot); pscan->phs_snapshot_any = true; + pscan->phs_reset_snapshot = false; } } @@ -171,7 +186,19 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); - if (!pscan->phs_snapshot_any) + /* + * For scans that + * use periodic snapshot resets, mark the scan accordingly and use the active + * snapshot as the initial state. + */ + if (pscan->phs_reset_snapshot) + { + Assert(ActiveSnapshotSet()); + flags |= SO_RESET_SNAPSHOT; + /* Start with current active snapshot. */ + snapshot = GetActiveSnapshot(); + } + else if (!pscan->phs_snapshot_any) { /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index ab1dfb30e73..0498c07c37b 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -79,6 +79,7 @@ #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) #define PARALLEL_KEY_CLIENTCONNINFO UINT64CONST(0xFFFFFFFFFFFF000F) +#define PARALLEL_KEY_SNAPSHOT_RESTORED UINT64CONST(0xFFFFFFFFFFFF0010) /* Fixed-size parallel state. 
*/ typedef struct FixedParallelState @@ -308,6 +309,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(bool), + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + /* Estimate how much we'll need for the entrypoint info. */ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + strlen(pcxt->function_name) + 2); @@ -379,6 +384,7 @@ InitializeParallelDSM(ParallelContext *pcxt) char *entrypointstate; char *uncommittedenumsspace; char *clientconninfospace; + bool *snapshot_set_flag_space; Size lnamelen; /* Serialize shared libraries we have loaded. */ @@ -494,6 +500,19 @@ InitializeParallelDSM(ParallelContext *pcxt) strcpy(entrypointstate, pcxt->library_name); strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + + /* + * Establish dynamic shared memory to pass information about importing + * of snapshot. + */ + snapshot_set_flag_space = + shm_toc_allocate(pcxt->toc, mul_size(sizeof(bool), pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + pcxt->worker[i].snapshot_restored = &snapshot_set_flag_space[i]; + *pcxt->worker[i].snapshot_restored = false; + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, snapshot_set_flag_space); } /* Update nworkers_to_launch, in case we changed nworkers above. */ @@ -670,6 +689,10 @@ LaunchParallelWorkers(ParallelContext *pcxt) * Wait for all workers to attach to their error queues, and throw an error if * any worker fails to do this. * + * wait_for_snapshot: track whether each parallel worker has successfully restored + * its snapshot. This is needed when using periodic snapshot resets to ensure all + * workers have a valid initial snapshot before proceeding with the scan. 
+ * * Callers can assume that if this function returns successfully, then the * number of workers given by pcxt->nworkers_launched have initialized and * attached to their error queues. Whether or not these workers are guaranteed @@ -699,7 +722,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) * call this function at all. */ void -WaitForParallelWorkersToAttach(ParallelContext *pcxt) +WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot) { int i; @@ -743,9 +766,12 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); if (shm_mq_get_sender(mq) != NULL) { - /* Yes, so it is known to be attached. */ - pcxt->known_attached_workers[i] = true; - ++pcxt->nknown_attached_workers; + if (!wait_for_snapshot || *(pcxt->worker[i].snapshot_restored)) + { + /* Yes, so it is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } } } else if (status == BGWH_STOPPED) @@ -788,6 +814,16 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) break; } } + + /* Set snapshot restored flag to false. */ + if (pcxt->nworkers > 0) + { + bool *snapshot_restored_space; + snapshot_restored_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + for (i = 0; i < pcxt->nworkers; ++i) + snapshot_restored_space[i] = false; + } } /* @@ -1304,6 +1340,7 @@ ParallelWorkerMain(Datum main_arg) shm_toc *toc; FixedParallelState *fps; char *error_queue_space; + bool *snapshot_restored_space; shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; @@ -1507,6 +1544,10 @@ ParallelWorkerMain(Datum main_arg) fps->parallel_leader_pgproc); PushActiveSnapshot(asnapshot); + /* Snapshot is restored, set flag to make leader know about it. */ + snapshot_restored_space = shm_toc_lookup(toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + snapshot_restored_space[ParallelWorkerNumber] = true; + /* * We've changed which tuples we can see, and must therefore invalidate * system caches. 
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f4dc4563c91..7c7a41fd09e 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1534,7 +1534,7 @@ index_concurrently_build(Oid heapRelationId, index_build(heapRel, indexRelation, indexInfo, false, true); InvalidateCatalogSnapshot(); - Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 8f219f60a93..06a91692072 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -372,7 +372,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel(node->ss.ss_currentRelation, pscan); diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 617713bde04..f1bf737f395 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -456,7 +456,8 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 2e6197f5f35..df9116532c0 100644 --- 
a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -353,14 +353,6 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { - /* - * We might be able to relax this, but nothing that could otherwise work - * needs it. - */ - if (IsInParallelMode()) - elog(ERROR, - "cannot update SecondarySnapshot during a parallel operation"); - /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if required. diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 60f857675e0..9a3e14e0c0e 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -28,6 +28,7 @@ typedef struct ParallelWorkerInfo { BackgroundWorkerHandle *bgwhandle; shm_mq_handle *error_mqh; + bool *snapshot_restored; } ParallelWorkerInfo; typedef struct ParallelContext @@ -67,7 +68,7 @@ extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); extern void LaunchParallelWorkers(ParallelContext *pcxt); -extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot); extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); extern void DestroyParallelContext(ParallelContext *pcxt); extern bool ParallelContextActive(void); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index ce340c076f8..acfa06aed78 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -81,6 +81,7 @@ typedef struct ParallelTableScanDescData RelFileLocator phs_locator; /* physical relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + bool phs_reset_snapshot; /* use SO_RESET_SNAPSHOT? 
*/ Size phs_snapshot_off; /* data for snapshot */ } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index d0709782dde..84b06ffa42f 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1151,7 +1151,8 @@ extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); */ extern void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot); + Snapshot snapshot, + bool reset_snapshot); /* * Begin a parallel scan. `pscan` needs to have been initialized with @@ -1783,7 +1784,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique index and non-parallel concurrent build, + * In case of non-unique concurrent index build, * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots * on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 7f7dffa5be4..37819bf0fb7 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -3,7 +3,7 @@ CREATE EXTENSION injection_points; SELECT injection_points_set_local(); SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); - +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); @@ -53,6 +53,9 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- 2.43.0 [application/octet-stream] v3-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch (51.1K, 4-v3-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch) download | inline diff: From a2510b475a7d28db2004507501a49e650814c9c6 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 15:59:47 +0300 Subject: [PATCH v3 2/4] Reset snapshots periodically in non-unique non-parallel concurrent index builds Long-living snapshots used by CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY can hold back the global xmin horizon. Commit d9d076222f5b attempted to allow VACUUM to ignore such snapshots to mitigate this problem. 
However, this was reverted in commit e28bb8851969 because it could cause indexes to miss heap tuples that were HOT-updated and HOT-pruned during the index creation, leading to index corruption. This patch introduces an alternative by periodically resetting the snapshot used during the first phase. By resetting the snapshot every N pages during the heap scan, it allows the xmin horizon to advance. Currently, this technique is applied only to: - the first scan of the heap: the second scan during index validation still uses a single snapshot to ensure index correctness - non-parallel index builds: parallel index builds are not yet supported and will be addressed in following commits - non-unique indexes: unique index builds still require a consistent snapshot to enforce uniqueness constraints; this will be addressed in following commits A new scan option SO_RESET_SNAPSHOT is introduced. When set, it causes the snapshot to be reset "between" every SO_RESET_SNAPSHOT_EACH_N_PAGE pages during the scan. The heap scan code is adjusted to support this option, and the index build code is modified to use it for applicable concurrent index builds that are not on system catalogs and not using parallel workers. 
--- contrib/amcheck/verify_nbtree.c | 3 +- contrib/pgstattuple/pgstattuple.c | 2 +- src/backend/access/brin/brin.c | 19 ++- src/backend/access/gin/gininsert.c | 22 ++++ src/backend/access/gist/gistbuild.c | 4 + src/backend/access/hash/hash.c | 3 + src/backend/access/heap/heapam.c | 47 ++++++- src/backend/access/heap/heapam_handler.c | 57 +++++++-- src/backend/access/index/genam.c | 2 +- src/backend/access/nbtree/nbtsort.c | 30 ++++- src/backend/access/spgist/spginsert.c | 3 + src/backend/access/transam/xact.c | 11 ++ src/backend/catalog/index.c | 31 ++++- src/backend/commands/indexcmds.c | 17 +-- src/backend/optimizer/plan/planner.c | 10 ++ src/backend/tcop/utility.c | 3 + src/backend/utils/errcodes.txt | 1 + src/bin/pg_amcheck/t/006_cic.pl | 2 +- src/include/access/heapam.h | 2 + src/include/access/tableam.h | 28 ++++- src/include/access/xact.h | 1 + src/test/modules/injection_points/Makefile | 2 +- .../expected/cic_reset_snapshots.out | 115 ++++++++++++++++++ src/test/modules/injection_points/meson.build | 1 + .../sql/cic_reset_snapshots.sql | 95 +++++++++++++++ 25 files changed, 474 insertions(+), 37 deletions(-) create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b74ab5f7a05..40874167631 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -557,7 +557,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK? */ + true, /* syncscan OK? 
*/ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 6a7f8cb4a7c..a0e7ed2e137 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -335,7 +335,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2a0f8c8e3b8..8534d7125a0 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1221,11 +1221,12 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) state->bs_sortstate = tuplesort_begin_index_brin(maintenance_work_mem, coordinate, TUPLESORT_NONE); - + InvalidateCatalogSnapshot(); /* scan the relation and merge per-worker results */ reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1238,6 +1239,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = table_index_build_scan(heap, index, indexInfo, false, true, brinbuildCallback, state, NULL); + InvalidateCatalogSnapshot(); /* * process the final batch * @@ -1257,6 +1259,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -2390,6 +2393,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage 
*bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2415,9 +2419,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. @@ -2460,6 +2471,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -2539,6 +2552,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _brin_end_parallel(brinleader, NULL); return; } @@ -2555,6 +2570,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index e54782d9dd8..c3085d174c7 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -31,6 +31,7 @@ #include "storage/condition_variable.h" #include "storage/proc.h" #include "storage/predicate.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/datum.h" #include "utils/memutils.h" @@ -683,6 +684,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_ParallelWorkers || !TransactionIdIsValid(MyProc->xmin)); + /* Report table scan phase started */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN); @@ -745,11 +749,13 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) tuplesort_begin_index_gin(heap, index, maintenance_work_mem, coordinate, TUPLESORT_NONE); + InvalidateCatalogSnapshot(); /* scan the relation in parallel and merge per-worker results */ reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -759,6 +765,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ reltuples = table_index_build_scan(heap, index, indexInfo, false, true, ginBuildCallback, &buildstate, NULL); + InvalidateCatalogSnapshot(); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); @@ -772,6 +779,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } 
MemoryContextDelete(buildstate.funcCtx); @@ -948,6 +956,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -972,9 +981,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. @@ -1017,6 +1033,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1091,6 +1109,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _gin_end_parallel(ginleader, NULL); return; } @@ -1107,6 +1127,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 7f57c787f4c..6207e9f5d81 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -43,6 +43,7 @@ #include "optimizer/optimizer.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -259,6 +260,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.indtuples = 0; buildstate.indtuplesSize = 0; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); if (buildstate.buildMode == GIST_SORTED_BUILD) { /* @@ -350,6 +352,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 6df5e7ccbd1..fdf4a3659b8 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -30,6 +30,7 @@ #include "nodes/execnodes.h" #include "optimizer/plancat.h" #include "pgstat.h" +#include "storage/proc.h" #include "storage/read_stream.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" @@ -210,6 +211,8 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e5bd062de77..52b27f0384b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -54,6 +54,7 @@ #include "utils/inval.h" 
#include "utils/spccache.h" #include "utils/syscache.h" +#include "utils/injection_point.h" static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, @@ -697,6 +698,36 @@ heap_prepare_pagescan(TableScanDesc sscan) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } +/* + * Reset the active snapshot during a scan. + * This ensures the xmin horizon can advance while maintaining safe tuple visibility. + * Note: No other snapshot should be active during this operation. + */ +static inline void +heap_reset_scan_snapshot(TableScanDesc sscan) +{ + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure active snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + PopActiveSnapshot(); + + sscan->rs_snapshot = InvalidSnapshot; /* just to be tidy */ + Assert(!HaveRegisteredOrActiveSnapshot()); + InvalidateCatalogSnapshot(); + + /* The goal of snapshot reset is to allow horizon to advance. */ + Assert(!TransactionIdIsValid(MyProc->xmin)); +#if USE_INJECTION_POINTS + /* In some cases it is still not possible due to xid assignment. */ + if (!TransactionIdIsValid(MyProc->xid)) + INJECTION_POINT("heap_reset_scan_snapshot_effective", NULL); +#endif + + PushActiveSnapshot(GetLatestSnapshot()); + sscan->rs_snapshot = GetActiveSnapshot(); +} + /* * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM. 
* @@ -738,7 +769,12 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); if (BufferIsValid(scan->rs_cbuf)) + { scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT && + scan->rs_cblock % SO_RESET_SNAPSHOT_EACH_N_PAGE == 0) + heap_reset_scan_snapshot((TableScanDesc) scan); + } } /* @@ -1409,7 +1445,16 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_parallelworkerdata != NULL) pfree(scan->rs_parallelworkerdata); - +#ifdef USE_ASSERT_CHECKING + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) + { + Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)); + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + } +#endif if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 253a735b6c1..9cad2e115bd 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1219,6 +1219,8 @@ heapam_index_build_range_scan(Relation heapRelation, ExprContext *econtext; Snapshot snapshot; bool need_unregister_snapshot = false; + bool need_pop_active_snapshot = false; + bool reset_snapshots = false; TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; @@ -1253,9 +1255,6 @@ heapam_index_build_range_scan(Relation heapRelation, /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - /* * Prepare for scan of the base relation. 
In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time @@ -1265,6 +1264,15 @@ heapam_index_build_range_scan(Relation heapRelation, */ OldestXmin = InvalidTransactionId; + /* + * For a unique index we need a consistent snapshot for the whole scan. + * In the case of a parallel scan some additional infrastructure is required + * to perform scan with SO_RESET_SNAPSHOT which is not yet ready. + */ + reset_snapshots = indexInfo->ii_Concurrent && + !indexInfo->ii_Unique && + !is_system_catalog; /* just in case */ + /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); @@ -1273,24 +1281,41 @@ heapam_index_build_range_scan(Relation heapRelation, { /* * Serial index build. - * - * Must begin our own heap scan in this case. We may also need to - * register a snapshot whose lifetime is under our direct control. */ if (!TransactionIdIsValid(OldestXmin)) { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - need_unregister_snapshot = true; + snapshot = GetTransactionSnapshot(); + /* + * Must begin our own heap scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. + * When the snapshot is reset during the scan, registration is + * not allowed because the snapshot is going to be changed every so + * often. + */ + if (!reset_snapshots) + { + snapshot = RegisterSnapshot(snapshot); + need_unregister_snapshot = true; + } + Assert(!ActiveSnapshotSet()); + PushActiveSnapshot(snapshot); + /* store link to snapshot because it may be copied */ + snapshot = GetActiveSnapshot(); + need_pop_active_snapshot = true; } else + { + Assert(!indexInfo->ii_Concurrent); snapshot = SnapshotAny; + } scan = table_beginscan_strat(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? 
*/ + allow_sync, /* syncscan OK? */ + reset_snapshots /* reset snapshots? */); } else { @@ -1304,6 +1329,8 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; } hscan = (HeapScanDesc) scan; @@ -1318,6 +1345,13 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. */ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1753,6 +1787,8 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) UnregisterSnapshot(snapshot); @@ -1825,7 +1861,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 5e89b86a62c..e4284181738 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -472,7 +472,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c 
b/src/backend/access/nbtree/nbtsort.c index 47a9bda30c9..6f091d39813 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -262,7 +262,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -325,18 +325,22 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -484,6 +488,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, else reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); + InvalidateCatalogSnapshot(); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -539,7 +546,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) { BTWriteState wstate; @@ -561,18 +568,21 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + InvalidateCatalogSnapshot(); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1411,6 +1421,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,9 +1447,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. 
*/ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1492,6 +1510,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1586,6 +1606,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1602,6 +1624,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 780ef646a54..49832a36470 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -24,6 +24,7 @@ #include "nodes/execnodes.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -143,6 +144,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) result = palloc0_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index aafc53e0164..7772d1379bc 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -3708,6 +3708,17 @@ PreventInTransactionBlock(bool isTopLevel, const char *stmtType) MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT; } +void +PreventIsolationUsesXactSnapshot(const char *stmtType) +{ + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND), + /* translator: %s represents an SQL statement name */ + errmsg("%s does not support transaction isolation higher than READ COMMITTED", + stmtType))); +} + /* * WarnNoTransactionBlock * RequireTransactionBlock diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index d8219b18c48..f4dc4563c91 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -81,6 +81,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; @@ -1493,8 +1494,8 @@ 
index_concurrently_build(Oid heapRelationId, Relation indexRelation; IndexInfo *indexInfo; - /* This had better make sure that a snapshot is active */ - Assert(ActiveSnapshotSet()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xid)); /* Open and lock the parent heap relation */ heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock); @@ -1512,19 +1513,29 @@ index_concurrently_build(Oid heapRelationId, indexRelation = index_open(indexRelationId, RowExclusiveLock); + /* BuildIndexInfo may require a snapshot for expressions and predicates */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * We have to re-build the IndexInfo struct, since it was lost in the * commit of the transaction where this concurrent index was created at * the catalog level. */ indexInfo = BuildIndexInfo(indexRelation); + /* Done with snapshot */ + PopActiveSnapshot(); Assert(!indexInfo->ii_ReadyForInserts); indexInfo->ii_Concurrent = true; indexInfo->ii_BrokenHotChain = false; + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + /* Now build the index */ index_build(heapRel, indexRelation, indexInfo, false, true); + InvalidateCatalogSnapshot(); + Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -1535,12 +1546,19 @@ index_concurrently_build(Oid heapRelationId, table_close(heapRel, NoLock); index_close(indexRelation, NoLock); + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates. 
*/ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); + /* we can do away with our snapshot */ + PopActiveSnapshot(); } /* @@ -3237,7 +3255,8 @@ IndexCheckExclusion(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK */ + true, /* syncscan OK */ + false); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3300,12 +3319,16 @@ IndexCheckExclusion(Relation heapRelation, * as of the start of the scan (see table_index_build_scan), whereas a normal * build takes care to include recently-dead tuples. This is OK because * we won't mark the index valid until all transactions that might be able - * to see those tuples are gone. The reason for doing that is to avoid + * to see those tuples are gone. One of the reasons for doing that is to avoid * bogus unique-index failures due to concurrent UPDATEs (we might see * different versions of the same row as being valid when we pass over them, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * + * Furthermore, in the case of a non-unique index we set SO_RESET_SNAPSHOT for the + * scan, which causes a new snapshot to be set as active every so often. The reason + * for that is to propagate the xmin horizon forward. + * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all * transactions that could have been modifying the table to terminate. Now diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index b89c6855364..415d0236c46 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1702,23 +1702,17 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. 
* - * We now take a new snapshot, and build the index using all tuples that - * are visible in this snapshot. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using single or + * multiple refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its * HOT-chain or the extension of the chain is HOT-safe for this index. */ - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* Perform concurrent build of index */ index_concurrently_build(tableId, indexRelationId); - /* we can do away with our snapshot */ - PopActiveSnapshot(); - /* * Commit this transaction to make the indisready update visible. */ @@ -2875,8 +2869,11 @@ ExecReindex(ParseState *pstate, const ReindexStmt *stmt, bool isTopLevel) } if (concurrently) + { PreventInTransactionBlock(isTopLevel, "REINDEX CONCURRENTLY"); + PreventIsolationUsesXactSnapshot("REINDEX CONCURRENTLY"); + } params.options = (verbose ? REINDEXOPT_VERBOSE : 0) | @@ -4132,9 +4129,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* * Update progress for the index to build, with the correct parent * table involved. 
@@ -4149,7 +4143,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Perform concurrent build of new index */ index_concurrently_build(newidx->tableId, newidx->indexId); - PopActiveSnapshot(); CommitTransactionCommand(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 42604a0f75c..d6bb3546bea 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,6 +63,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/snapmgr.h" /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -7027,6 +7028,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) Relation heap; Relation index; RelOptInfo *rel; + bool need_pop_active_snapshot = false; int parallel_workers; BlockNumber heap_blocks; double reltuples; @@ -7082,6 +7084,12 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) heap = table_open(tableOid, NoLock); index = index_open(indexOid, NoLock); + /* Set ActiveSnapshot since functions in the indexes may need it */ + if (!ActiveSnapshotSet()) + { + PushActiveSnapshot(GetTransactionSnapshot()); + need_pop_active_snapshot = true; + } /* * Determine if it's safe to proceed. 
* @@ -7139,6 +7147,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) parallel_workers--; done: + if (need_pop_active_snapshot) + PopActiveSnapshot(); index_close(index, NoLock); table_close(heap, NoLock); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 2b609bfc824..b55232116de 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1463,8 +1463,11 @@ ProcessUtilitySlow(ParseState *pstate, bool is_alter_table; if (stmt->concurrent) + { PreventInTransactionBlock(isTopLevel, "CREATE INDEX CONCURRENTLY"); + PreventIsolationUsesXactSnapshot("CREATE INDEX CONCURRENTLY"); + } /* * Look up the relation OID just once, right here at the diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 5b25402ebbe..a80f08af241 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -259,6 +259,7 @@ Section: Class 25 - Invalid Transaction State 25P02 E ERRCODE_IN_FAILED_SQL_TRANSACTION in_failed_sql_transaction 25P03 E ERRCODE_IDLE_IN_TRANSACTION_SESSION_TIMEOUT idle_in_transaction_session_timeout 25P04 E ERRCODE_TRANSACTION_TIMEOUT transaction_timeout +25P05 E ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND inappropriate_isolation_level_for_command Section: Class 26 - Invalid SQL Statement Name diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl index 70b185212e7..4ae7b796c6e 100644 --- a/src/bin/pg_amcheck/t/006_cic.pl +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -152,7 +152,7 @@ $node->pgbench( 0, [qr{actually processed}], [qr{^$}], - 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN', { 'concurrent_ops_gin_idx' => q( SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 2fdc50b865b..fd0c2cf0bf0 100644 --- a/src/include/access/heapam.h +++ 
b/src/include/access/heapam.h @@ -43,6 +43,8 @@ #define HEAP_PAGE_PRUNE_MARK_UNUSED_NOW (1 << 0) #define HEAP_PAGE_PRUNE_FREEZE (1 << 1) +#define SO_RESET_SNAPSHOT_EACH_N_PAGE 4096 + typedef struct BulkInsertStateData *BulkInsertState; typedef struct GlobalVisState GlobalVisState; typedef struct TupleTableSlot TupleTableSlot; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 06084752245..d0709782dde 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -25,6 +25,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/injection_point.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -63,6 +64,17 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* + * Reset scan and catalog snapshot every so often? If so, each + * SO_RESET_SNAPSHOT_EACH_N_PAGE pages active snapshot is popped, + * catalog snapshot invalidated, latest snapshot pushed as active. + * + * At the end of the scan the snapshot is not popped. + * The goal of this mode is to keep the xmin horizon propagating forward. + * + * See heap_reset_scan_snapshot for details. + */ + SO_RESET_SNAPSHOT = 1 << 10, } ScanOptions; /* @@ -919,7 +931,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, - bool allow_strat, bool allow_sync) + bool allow_strat, bool allow_sync, + bool reset_snapshot) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; @@ -927,6 +940,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, flags |= SO_ALLOW_STRAT; if (allow_sync) flags |= SO_ALLOW_SYNC; + if (reset_snapshot) + { + INJECTION_POINT("table_beginscan_strat_reset_snapshots", NULL); + /* Active snapshot is required on start. */ + Assert(GetActiveSnapshot() == snapshot); + /* Active snapshot should not be registered to keep xmin propagating. 
*/ + Assert(GetActiveSnapshot()->regd_count == 0); + flags |= (SO_RESET_SNAPSHOT); + } return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); } @@ -1760,6 +1782,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * very hard to detect whether they're really incompatible with the chain tip. * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. + * + * In the case of a non-unique index and a non-parallel concurrent build, + * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots + * on the fly to allow the xmin horizon to propagate. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/access/xact.h b/src/include/access/xact.h index f0b4d795071..2e3f617a949 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -488,6 +488,7 @@ extern bool IsTransactionOrTransactionBlock(void); extern char TransactionBlockStatusCode(void); extern void AbortOutOfAnyTransaction(void); extern void PreventInTransactionBlock(bool isTopLevel, const char *stmtType); +extern void PreventIsolationUsesXactSnapshot(const char *stmtType); extern void RequireTransactionBlock(bool isTopLevel, const char *stmtType); extern void WarnNoTransactionBlock(bool isTopLevel, const char *stmtType); extern bool IsInTransactionBlock(bool isTopLevel); diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index a41d781f8c9..eeaeaaf2cc6 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -9,7 +9,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc vacuum +REGRESS = injection_points hashagg reindex_conc vacuum cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic \ diff --git 
a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 00000000000..b4ad90eb339 --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 +1,115 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX 
CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX 
CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY cannot run inside a transaction block +ROLLBACK ; +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY does not support transaction isolation higher than READ COMMITTED +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +ERROR: REINDEX CONCURRENTLY does not support transaction 
isolation higher than READ COMMITTED +RESET default_transaction_isolation; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index fcc85414515..5d298cd2710 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -36,6 +36,7 @@ tests += { 'hashagg', 'reindex_conc', 'vacuum', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 00000000000..7f7dffa5be4 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,95 @@ +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET 
(parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY 
cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ROLLBACK ; + +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; -- 2.43.0 [application/octet-stream] v3-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch (42.6K, 5-v3-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch) download | inline diff: From 197d221cce2c722928323f046a7c15bc0fd04473 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 19:02:34 +0300 Subject: [PATCH v3 4/4] Support snapshot resets in concurrent builds of unique indexes Previously, concurrent builds of unique index used a fixed snapshot for the entire scan to ensure proper uniqueness checks. Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: - ignoring SnapshotSelf dead tuples during uniqueness checks in tuplesort as not a guarantee, but a fail-fast mechanics - adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values as a guarantee of correctness Tuples are SnapshotSelf tested only in the case of equal index key values, otherwise _bt_load works like before. 
--- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 6 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtreadpage.c | 2 +- src/backend/access/nbtree/nbtsort.c | 195 ++++++++++++++---- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 30 ++- src/backend/catalog/index.c | 8 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/sort/tuplesortvariants.c | 71 +++++-- src/include/access/nbtree.h | 4 +- src/include/access/tableam.h | 2 +- src/include/utils/tuplesort.h | 1 + .../expected/cic_reset_snapshots.out | 31 ++- 14 files changed, 290 insertions(+), 96 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375a..829dad1194e 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. Any tuples visible in the -snapshot will have only valid forward-growing HOT chains. (They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. +for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) +As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. 
We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a639e8b31e0..04738d59b91 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1261,15 +1261,15 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique index we need consistent snapshot for the whole scan. + * For concurrent builds of non-system indexes, we may want to periodically + * reset snapshots to allow vacuum to clean up tuples. */ reset_snapshots = indexInfo->ii_Concurrent && - !indexInfo->ii_Unique && !is_system_catalog; /* just for the case */ /* okay to ignore lazy VACUUMs here */ diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index af7affdf409..10f4f7eeba9 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -149,7 +149,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -376,7 +376,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > 
nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index 2ba1ca66023..c4c278e777a 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -623,7 +623,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 01e338fb562..7bff89e0928 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -87,6 +87,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -105,6 +106,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -207,15 +209,13 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only 
when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and built non-concurrently. + * Dead tuples are put into spool2 instead of spool in order to avoid uniqueness check. */ BTSpool *spool2; double indtuples; @@ -262,7 +262,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -307,8 +307,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -325,20 +323,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -385,6 +383,11 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks in case of concurrent build. + * It is required because of periodic reset of snapshot. + */ + btspool->unique_dead_ignored = indexInfo->ii_Concurrent && indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -433,8 +436,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor. */ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -442,8 +446,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. 
+ * + * In case of concurrent build dead tuples do not need to be put into index + * since we wait for all snapshots older than reference snapshot during the + * validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent) { BTSpool *btspool2 = palloc0_object(BTSpool); SortCoordinate coordinate2 = NULL; @@ -474,7 +482,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -487,7 +495,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -543,7 +551,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent) { BTWriteState wstate; @@ -565,7 +573,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; @@ -579,7 +587,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1156,13 +1164,118 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; + bool fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * The unique_dead_ignored flag does not guarantee the absence of multiple + * alive tuples with the same values in the spool. This may happen if + * alive tuples are located between a few dead tuples, like this: addda. 
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* not the first tuple */ + { + bool has_nulls = false, + call_again, /* just to pass something */ + ignored, /* just to pass something */ + now_alive; + ItemPointerData tid; + + /* is this tuple equal to the previous one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* test the previous tuple if not tested yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group? 
*/ + if (seen_alive && now_alive) + { + char *key_desc; + TupleDesc tupDes = RelationGetDescr(wstate->index); + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + + index_deform_tuple(itup, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(wstate->index, values, isnull); + + /* keep this message in sync with the same in comparetup_index_btree_tiebreak */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(wstate->index)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(wstate->heap, + RelationGetRelationName(wstate->index)))); + } + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + if (prev) pfree(prev); + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1322,7 +1435,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1419,7 +1532,6 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; - bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1437,21 +1549,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; - /* - * For concurrent non-unique index builds, we can periodically reset snapshots - * to allow the xmin horizon to advance. This is safe since these builds don't - * require a consistent view across the entire scan. Unique indexes still need - * a stable snapshot to properly enforce uniqueness constraints. - */ - reset_snapshot = isconcurrent && !btspool->isunique; - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that, while that snapshot may be reset periodically in - * case of non-unique index. + * live according to that, while that snapshot may be reset periodically. */ if (!isconcurrent) { @@ -1459,16 +1562,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } - else if (reset_snapshot) + else { + /* + * For concurrent index builds, we can periodically reset snapshots to allow + * the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. 
+ */ snapshot = InvalidSnapshot; PushActiveSnapshot(GetTransactionSnapshot()); } - else - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1538,6 +1641,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1552,7 +1656,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), snapshot, - reset_snapshot); + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1632,7 +1736,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * In case of concurrent build snapshots are going to be reset periodically. * Wait until all workers imported initial snapshot. */ - if (reset_snapshot) + if (isconcurrent) WaitForParallelWorkersToAttach(pcxt, true); /* Join heap scan ourselves */ @@ -1643,13 +1747,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - if (!reset_snapshot) + if (!isconcurrent) WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); InvalidateCatalogSnapshot(); - Assert(!reset_snapshot|| !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1749,6 +1853,7 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ if (!btleader->btshared->isunique) @@ -1852,11 +1957,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || btshared->isconcurrent) { btspool2 = NULL; sharedsort2 = NULL; @@ -1936,6 +2042,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1958,14 +2065,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = 
btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index de9eca3c8b2..b174fb8d3ee 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -688,7 +688,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -719,7 +719,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -968,7 +968,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -989,7 +989,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1028,7 +1028,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1150,7 +1150,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 9b091858997..13fb676e389 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -30,9 +30,6 @@ static int _bt_compare_int(const void *va, const void *vb); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); - /* * _bt_mkscankey @@ -708,7 +705,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -826,17 +823,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. 
* + * This is exported to be used as a comparison function during a concurrent + * unique index build, in case _bt_keep_natts_fast is not suitable because + * the collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * *hasnulls (if not NULL) is set to true when a compared column is null. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -862,6 +866,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; if (isNull1 != isNull2) break; @@ -881,7 +887,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -892,7 +898,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. + * save cycles. 
Also, it may be used as comparison function during concurrent + * build of unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -901,6 +908,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * hasnulls value set to true in case of any null column in any tuple. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -909,7 +918,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -926,6 +936,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 7c7a41fd09e..228e6bce65d 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1534,7 +1534,7 @@ index_concurrently_build(Oid heapRelationId, index_build(heapRel, indexRelation, indexInfo, false, true); InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions 
*/ AtEOXact_GUC(false, save_nestlevel); @@ -3325,9 +3325,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 415d0236c46..97029be378f 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1702,8 +1702,8 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. 
Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 2509ac3e3a4..1ecf745c663 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -25,6 +25,8 @@ #include "access/hash.h" #include "access/htup_details.h" #include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" #include "catalog/index.h" #include "catalog/pg_collation.h" #include "executor/executor.h" @@ -35,6 +37,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* sort-type codes for sort__start probes */ @@ -136,6 +139,7 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ } TuplesortIndexBTreeArg; /* @@ -361,6 +365,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -403,6 +408,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1670,6 +1676,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1679,18 +1686,58 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - 
- ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* This is fail-fast check, see _bt_load for details. */ + if (arg->uniqueDeadIgnored) + { + bool any_tuple_dead, + call_again = false, + ignored; + + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tuple2->t_tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index da7503c57b6..b72445a7610 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1309,8 +1309,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 84b06ffa42f..92290e79591 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1784,7 +1784,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent index build, + * In case of concurrent index build, * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots * on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index da68f45acf2..448dc83aa58 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -396,6 +396,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index b4ad90eb339..bb84d61f40d 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -17,6 +17,12 @@ SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice' (1 row) +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -35,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -72,30 +82,47 @@ NOTICE: 
notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + injection_points_detach +------------------------- + +(1 row) + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: 
notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; BEGIN TRANSACTION; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); -- 2.43.0 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY 2026-03-09 00:03 Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-03-20 01:15 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> @ 2026-03-21 23:50 ` Mihail Nikalayeu <[email protected]> 2026-04-06 17:55 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 0 siblings, 1 reply; 6+ messages in thread From: Mihail Nikalayeu @ 2026-03-21 23:50 UTC (permalink / raw) To: Álvaro Herrera <[email protected]>; +Cc: PostgreSQL Hackers <[email protected]>; Matthias van de Meent <[email protected]>; Antonin Houska <[email protected]>; Sergey Sargsyan <[email protected]>; Hannu Krosing <[email protected]> Hello, Álvaro! Thanks for looking into it. > Or should we just consider the test as not-for-commit, and only a > development aid? Initially I thought so, but it looks like it is possible to make it committable. > The test in 0001 is a bit on the slow side; should we make it > optional with PG_TEST_EXTRA? I made the *parameters* depend on PG_TEST_EXTRA ~= stress. It is possible to apply the same pattern to other stress tests too. > The last pgbench subtest mentions GIN in the test name but doesn't > actually run it. Do we care? Would it be good to make the table be > unlogged? Fixed, it has its own pgbench because it has its own gin_index_check. > Would it be good to make the table be unlogged? Good idea, done. > I think all-but-one backends will > complete all the 999 transactions in the first 10ms sleep that the one > backend running the CIC does. Am I right about this? It actually has enough time to do multiple CICs (I can see it in the log). I updated the test to use a random delay: from 0 to 1 for non-stress variants, and up to 10 for stress. 
Also, fixed a few small styling issues + added additional fixes for waiting for a snapshot to be restored by a parallel worker. Best regards, Mikhail. Attachments: [application/octet-stream] v4-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch (51.1K, 2-v4-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch) download | inline diff: From 24a1346995a2fba88fc9a13f71738986f5fc2d0f Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 15:59:47 +0300 Subject: [PATCH v4 2/4] Reset snapshots periodically in non-unique non-parallel concurrent index builds Long-living snapshots used by CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY can hold back the global xmin horizon. Commit d9d076222f5b attempted to allow VACUUM to ignore such snapshots to mitigate this problem. However, this was reverted in commit e28bb8851969 because it could cause indexes to miss heap tuples that were HOT-updated and HOT-pruned during the index creation, leading to index corruption. This patch introduces an alternative by periodically resetting the snapshot used during the first phase. By resetting the snapshot every N pages during the heap scan, it allows the xmin horizon to advance. Currently, this technique is applied to: - only during the first scan of the heap: The second scan during index validation still uses a single snapshot to ensure index correctness - non-parallel index builds: Parallel index builds are not yet supported and will be addressed in following commits - non-unique indexes: Unique index builds still require a consistent snapshot to enforce uniqueness constraints, will be addressed in following commits A new scan option SO_RESET_SNAPSHOT is introduced. When set, it causes the snapshot to be reset "between" every SO_RESET_SNAPSHOT_EACH_N_PAGE pages during the scan. 
The heap scan code is adjusted to support this option, and the index build code is modified to use it for applicable concurrent index builds that are not on system catalogs and not using parallel workers. --- contrib/amcheck/verify_nbtree.c | 3 +- contrib/pgstattuple/pgstattuple.c | 2 +- src/backend/access/brin/brin.c | 19 ++- src/backend/access/gin/gininsert.c | 22 ++++ src/backend/access/gist/gistbuild.c | 4 + src/backend/access/hash/hash.c | 3 + src/backend/access/heap/heapam.c | 47 ++++++- src/backend/access/heap/heapam_handler.c | 57 +++++++-- src/backend/access/index/genam.c | 2 +- src/backend/access/nbtree/nbtsort.c | 30 ++++- src/backend/access/spgist/spginsert.c | 3 + src/backend/access/transam/xact.c | 11 ++ src/backend/catalog/index.c | 31 ++++- src/backend/commands/indexcmds.c | 17 +-- src/backend/optimizer/plan/planner.c | 10 ++ src/backend/tcop/utility.c | 3 + src/backend/utils/errcodes.txt | 1 + src/bin/pg_amcheck/t/006_cic.pl | 2 +- src/include/access/heapam.h | 2 + src/include/access/tableam.h | 28 ++++- src/include/access/xact.h | 1 + src/test/modules/injection_points/Makefile | 2 +- .../expected/cic_reset_snapshots.out | 115 ++++++++++++++++++ src/test/modules/injection_points/meson.build | 1 + .../sql/cic_reset_snapshots.sql | 95 +++++++++++++++ 25 files changed, 474 insertions(+), 37 deletions(-) create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b74ab5f7a05..40874167631 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -557,7 +557,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK? */ + true, /* syncscan OK? 
*/ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 6a7f8cb4a7c..a0e7ed2e137 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -335,7 +335,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2a0f8c8e3b8..8534d7125a0 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1221,11 +1221,12 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) state->bs_sortstate = tuplesort_begin_index_brin(maintenance_work_mem, coordinate, TUPLESORT_NONE); - + InvalidateCatalogSnapshot(); /* scan the relation and merge per-worker results */ reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1238,6 +1239,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = table_index_build_scan(heap, index, indexInfo, false, true, brinbuildCallback, state, NULL); + InvalidateCatalogSnapshot(); /* * process the final batch * @@ -1257,6 +1259,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -2390,6 +2393,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage 
*bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2415,9 +2419,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. @@ -2460,6 +2471,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -2539,6 +2552,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _brin_end_parallel(brinleader, NULL); return; } @@ -2555,6 +2570,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index e54782d9dd8..c3085d174c7 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -31,6 +31,7 @@ #include "storage/condition_variable.h" #include "storage/proc.h" #include "storage/predicate.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/datum.h" #include "utils/memutils.h" @@ -683,6 +684,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_ParallelWorkers || !TransactionIdIsValid(MyProc->xmin)); + /* Report table scan phase started */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN); @@ -745,11 +749,13 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) tuplesort_begin_index_gin(heap, index, maintenance_work_mem, coordinate, TUPLESORT_NONE); + InvalidateCatalogSnapshot(); /* scan the relation in parallel and merge per-worker results */ reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -759,6 +765,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ reltuples = table_index_build_scan(heap, index, indexInfo, false, true, ginBuildCallback, &buildstate, NULL); + InvalidateCatalogSnapshot(); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); @@ -772,6 +779,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } 
MemoryContextDelete(buildstate.funcCtx); @@ -948,6 +956,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -972,9 +981,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. @@ -1017,6 +1033,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1091,6 +1109,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _gin_end_parallel(ginleader, NULL); return; } @@ -1107,6 +1127,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 7f57c787f4c..6207e9f5d81 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -43,6 +43,7 @@ #include "optimizer/optimizer.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -259,6 +260,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.indtuples = 0; buildstate.indtuplesSize = 0; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); if (buildstate.buildMode == GIST_SORTED_BUILD) { /* @@ -350,6 +352,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 6df5e7ccbd1..fdf4a3659b8 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -30,6 +30,7 @@ #include "nodes/execnodes.h" #include "optimizer/plancat.h" #include "pgstat.h" +#include "storage/proc.h" #include "storage/read_stream.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" @@ -210,6 +211,8 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e5bd062de77..52b27f0384b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -54,6 +54,7 @@ #include "utils/inval.h" 
#include "utils/spccache.h" #include "utils/syscache.h" +#include "utils/injection_point.h" static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, @@ -697,6 +698,36 @@ heap_prepare_pagescan(TableScanDesc sscan) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } +/* + * Reset the active snapshot during a scan. + * This ensures the xmin horizon can advance while maintaining safe tuple visibility. + * Note: No other snapshot should be active during this operation. + */ +static inline void +heap_reset_scan_snapshot(TableScanDesc sscan) +{ + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure active snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + PopActiveSnapshot(); + + sscan->rs_snapshot = InvalidSnapshot; /* just to be tidy */ + Assert(!HaveRegisteredOrActiveSnapshot()); + InvalidateCatalogSnapshot(); + + /* The goal of snapshot reset is to allow horizon to advance. */ + Assert(!TransactionIdIsValid(MyProc->xmin)); +#if USE_INJECTION_POINTS + /* In some cases this is still not possible due to xid assignment. */ + if (!TransactionIdIsValid(MyProc->xid)) + INJECTION_POINT("heap_reset_scan_snapshot_effective", NULL); +#endif + + PushActiveSnapshot(GetLatestSnapshot()); + sscan->rs_snapshot = GetActiveSnapshot(); +} + /* * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
* @@ -738,7 +769,12 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); if (BufferIsValid(scan->rs_cbuf)) + { scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT && + scan->rs_cblock % SO_RESET_SNAPSHOT_EACH_N_PAGE == 0) + heap_reset_scan_snapshot((TableScanDesc) scan); + } } /* @@ -1409,7 +1445,16 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_parallelworkerdata != NULL) pfree(scan->rs_parallelworkerdata); - +#ifdef USE_ASSERT_CHECKING + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) + { + Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)); + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + } +#endif if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 253a735b6c1..9cad2e115bd 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1219,6 +1219,8 @@ heapam_index_build_range_scan(Relation heapRelation, ExprContext *econtext; Snapshot snapshot; bool need_unregister_snapshot = false; + bool need_pop_active_snapshot = false; + bool reset_snapshots = false; TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; @@ -1253,9 +1255,6 @@ heapam_index_build_range_scan(Relation heapRelation, /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - /* * Prepare for scan of the base relation. 
In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time @@ -1265,6 +1264,15 @@ heapam_index_build_range_scan(Relation heapRelation, */ OldestXmin = InvalidTransactionId; + /* + * For unique index we need consistent snapshot for the whole scan. + * In case of parallel scan some additional infrastructure required + * to perform scan with SO_RESET_SNAPSHOT which is not yet ready. + */ + reset_snapshots = indexInfo->ii_Concurrent && + !indexInfo->ii_Unique && + !is_system_catalog; /* just for the case */ + /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); @@ -1273,24 +1281,41 @@ heapam_index_build_range_scan(Relation heapRelation, { /* * Serial index build. - * - * Must begin our own heap scan in this case. We may also need to - * register a snapshot whose lifetime is under our direct control. */ if (!TransactionIdIsValid(OldestXmin)) { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - need_unregister_snapshot = true; + snapshot = GetTransactionSnapshot(); + /* + * Must begin our own heap scan in this case. We may also need to + * register a snapshot whose lifetime is under our direct control. + * In case of resetting of snapshot during the scan registration is + * not allowed because snapshot is going to be changed every so + * often. + */ + if (!reset_snapshots) + { + snapshot = RegisterSnapshot(snapshot); + need_unregister_snapshot = true; + } + Assert(!ActiveSnapshotSet()); + PushActiveSnapshot(snapshot); + /* store link to snapshot because it may be copied */ + snapshot = GetActiveSnapshot(); + need_pop_active_snapshot = true; } else + { + Assert(!indexInfo->ii_Concurrent); snapshot = SnapshotAny; + } scan = table_beginscan_strat(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? 
*/ + allow_sync, /* syncscan OK? */ + reset_snapshots /* reset snapshots? */); } else { @@ -1304,6 +1329,8 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; } hscan = (HeapScanDesc) scan; @@ -1318,6 +1345,13 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. */ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1753,6 +1787,8 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) UnregisterSnapshot(snapshot); @@ -1825,7 +1861,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 5e89b86a62c..e4284181738 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -472,7 +472,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c 
b/src/backend/access/nbtree/nbtsort.c index 47a9bda30c9..6f091d39813 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -262,7 +262,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -325,18 +325,22 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -484,6 +488,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, else reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); + InvalidateCatalogSnapshot(); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -539,7 +546,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) { BTWriteState wstate; @@ -561,18 +568,21 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + InvalidateCatalogSnapshot(); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1411,6 +1421,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,9 +1447,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. 
*/ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1492,6 +1510,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1586,6 +1606,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1602,6 +1624,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 780ef646a54..49832a36470 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -24,6 +24,7 @@ #include "nodes/execnodes.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -143,6 +144,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) result = palloc0_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index aafc53e0164..7772d1379bc 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -3708,6 +3708,17 @@ PreventInTransactionBlock(bool isTopLevel, const char *stmtType) MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT; } +void +PreventIsolationUsesXactSnapshot(const char *stmtType) +{ + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND), + /* translator: %s represents an SQL statement name */ + errmsg("%s does not support transaction isolation higher than READ COMMITTED", + stmtType))); +} + /* * WarnNoTransactionBlock * RequireTransactionBlock diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index d8219b18c48..f4dc4563c91 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -81,6 +81,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; @@ -1493,8 +1494,8 @@ 
index_concurrently_build(Oid heapRelationId, Relation indexRelation; IndexInfo *indexInfo; - /* This had better make sure that a snapshot is active */ - Assert(ActiveSnapshotSet()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xid)); /* Open and lock the parent heap relation */ heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock); @@ -1512,19 +1513,29 @@ index_concurrently_build(Oid heapRelationId, indexRelation = index_open(indexRelationId, RowExclusiveLock); + /* BuildIndexInfo may require a snapshot for expressions and predicates */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * We have to re-build the IndexInfo struct, since it was lost in the * commit of the transaction where this concurrent index was created at * the catalog level. */ indexInfo = BuildIndexInfo(indexRelation); + /* Done with snapshot */ + PopActiveSnapshot(); Assert(!indexInfo->ii_ReadyForInserts); indexInfo->ii_Concurrent = true; indexInfo->ii_BrokenHotChain = false; + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + /* Now build the index */ index_build(heapRel, indexRelation, indexInfo, false, true); + InvalidateCatalogSnapshot(); + Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -1535,12 +1546,19 @@ index_concurrently_build(Oid heapRelationId, table_close(heapRel, NoLock); index_close(indexRelation, NoLock); + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates.
*/ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); + /* we can do away with our snapshot */ + PopActiveSnapshot(); } /* @@ -3237,7 +3255,8 @@ IndexCheckExclusion(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK */ + true, /* syncscan OK */ + false); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3300,12 +3319,16 @@ IndexCheckExclusion(Relation heapRelation, * as of the start of the scan (see table_index_build_scan), whereas a normal * build takes care to include recently-dead tuples. This is OK because * we won't mark the index valid until all transactions that might be able - * to see those tuples are gone. The reason for doing that is to avoid + * to see those tuples are gone. One of the reasons for doing that is to avoid * bogus unique-index failures due to concurrent UPDATEs (we might see * different versions of the same row as being valid when we pass over them, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * + * Furthermore, for a non-unique index we set SO_RESET_SNAPSHOT for the + * scan, which causes a new snapshot to be set as active every so often. The reason + * for that is to propagate the xmin horizon forward. + * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all * transactions that could have been modifying the table to terminate. Now diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index b89c6855364..415d0236c46 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1702,23 +1702,17 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys.
* - * We now take a new snapshot, and build the index using all tuples that - * are visible in this snapshot. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using single or + * multiple refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its * HOT-chain or the extension of the chain is HOT-safe for this index. */ - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* Perform concurrent build of index */ index_concurrently_build(tableId, indexRelationId); - /* we can do away with our snapshot */ - PopActiveSnapshot(); - /* * Commit this transaction to make the indisready update visible. */ @@ -2875,8 +2869,11 @@ ExecReindex(ParseState *pstate, const ReindexStmt *stmt, bool isTopLevel) } if (concurrently) + { PreventInTransactionBlock(isTopLevel, "REINDEX CONCURRENTLY"); + PreventIsolationUsesXactSnapshot("REINDEX CONCURRENTLY"); + } params.options = (verbose ? REINDEXOPT_VERBOSE : 0) | @@ -4132,9 +4129,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* * Update progress for the index to build, with the correct parent * table involved. 
@@ -4149,7 +4143,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Perform concurrent build of new index */ index_concurrently_build(newidx->tableId, newidx->indexId); - PopActiveSnapshot(); CommitTransactionCommand(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 42604a0f75c..d6bb3546bea 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,6 +63,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/snapmgr.h" /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -7027,6 +7028,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) Relation heap; Relation index; RelOptInfo *rel; + bool need_pop_active_snapshot = false; int parallel_workers; BlockNumber heap_blocks; double reltuples; @@ -7082,6 +7084,12 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) heap = table_open(tableOid, NoLock); index = index_open(indexOid, NoLock); + /* Set ActiveSnapshot since functions in the indexes may need it */ + if (!ActiveSnapshotSet()) + { + PushActiveSnapshot(GetTransactionSnapshot()); + need_pop_active_snapshot = true; + } /* * Determine if it's safe to proceed. 
* @@ -7139,6 +7147,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) parallel_workers--; done: + if (need_pop_active_snapshot) + PopActiveSnapshot(); index_close(index, NoLock); table_close(heap, NoLock); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 2b609bfc824..b55232116de 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1463,8 +1463,11 @@ ProcessUtilitySlow(ParseState *pstate, bool is_alter_table; if (stmt->concurrent) + { PreventInTransactionBlock(isTopLevel, "CREATE INDEX CONCURRENTLY"); + PreventIsolationUsesXactSnapshot("CREATE INDEX CONCURRENTLY"); + } /* * Look up the relation OID just once, right here at the diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 5b25402ebbe..a80f08af241 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -259,6 +259,7 @@ Section: Class 25 - Invalid Transaction State 25P02 E ERRCODE_IN_FAILED_SQL_TRANSACTION in_failed_sql_transaction 25P03 E ERRCODE_IDLE_IN_TRANSACTION_SESSION_TIMEOUT idle_in_transaction_session_timeout 25P04 E ERRCODE_TRANSACTION_TIMEOUT transaction_timeout +25P05 E ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND inappropriate_isolation_level_for_command Section: Class 26 - Invalid SQL Statement Name diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl index 0495ac10263..81cfde4e028 100644 --- a/src/bin/pg_amcheck/t/006_cic.pl +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -196,7 +196,7 @@ $node->pgbench( 0, [qr{actually processed}], [qr{^$}], - 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN', { 'concurrent_ops_gin_idx' => sprintf(q( SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 2fdc50b865b..fd0c2cf0bf0 100644 --- a/src/include/access/heapam.h +++ 
b/src/include/access/heapam.h @@ -43,6 +43,8 @@ #define HEAP_PAGE_PRUNE_MARK_UNUSED_NOW (1 << 0) #define HEAP_PAGE_PRUNE_FREEZE (1 << 1) +#define SO_RESET_SNAPSHOT_EACH_N_PAGE 4096 + typedef struct BulkInsertStateData *BulkInsertState; typedef struct GlobalVisState GlobalVisState; typedef struct TupleTableSlot TupleTableSlot; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 06084752245..d0709782dde 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -25,6 +25,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/injection_point.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -63,6 +64,17 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* + * Reset scan and catalog snapshot every so often? If so, every + * SO_RESET_SNAPSHOT_EACH_N_PAGE pages the active snapshot is popped, the + * catalog snapshot is invalidated and the latest one is pushed as active. + * + * At the end of the scan the snapshot is not popped. + * The goal of this mode is to keep the xmin horizon propagating forward. + * + * See heap_reset_scan_snapshot for details. + */ + SO_RESET_SNAPSHOT = 1 << 10, } ScanOptions; /* @@ -919,7 +931,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, - bool allow_strat, bool allow_sync) + bool allow_strat, bool allow_sync, + bool reset_snapshot) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; @@ -927,6 +940,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, flags |= SO_ALLOW_STRAT; if (allow_sync) flags |= SO_ALLOW_SYNC; + if (reset_snapshot) + { + INJECTION_POINT("table_beginscan_strat_reset_snapshots", NULL); + /* Active snapshot is required on start. */ + Assert(GetActiveSnapshot() == snapshot); + /* Active snapshot should not be registered to keep xmin propagating.
*/ + Assert(GetActiveSnapshot()->regd_count == 0); + flags |= (SO_RESET_SNAPSHOT); + } return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags); } @@ -1760,6 +1782,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * very hard to detect whether they're really incompatible with the chain tip. * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. + * + * In case of a non-unique index and a non-parallel concurrent build, + * SO_RESET_SNAPSHOT is applied to the scan. That leads to changing snapshots + * on the fly to allow the xmin horizon to propagate. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/access/xact.h b/src/include/access/xact.h index f0b4d795071..2e3f617a949 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -488,6 +488,7 @@ extern bool IsTransactionOrTransactionBlock(void); extern char TransactionBlockStatusCode(void); extern void AbortOutOfAnyTransaction(void); extern void PreventInTransactionBlock(bool isTopLevel, const char *stmtType); +extern void PreventIsolationUsesXactSnapshot(const char *stmtType); extern void RequireTransactionBlock(bool isTopLevel, const char *stmtType); extern void WarnNoTransactionBlock(bool isTopLevel, const char *stmtType); extern bool IsInTransactionBlock(bool isTopLevel); diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index a41d781f8c9..eeaeaaf2cc6 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -9,7 +9,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc vacuum +REGRESS = injection_points hashagg reindex_conc vacuum cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic \ diff --git
a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 00000000000..b4ad90eb339 --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 +1,115 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX 
CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX 
CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY cannot run inside a transaction block +ROLLBACK ; +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY does not support transaction isolation higher than READ COMMITTED +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +ERROR: REINDEX CONCURRENTLY does not support transaction 
isolation higher than READ COMMITTED +RESET default_transaction_isolation; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index fcc85414515..5d298cd2710 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -36,6 +36,7 @@ tests += { 'hashagg', 'reindex_conc', 'vacuum', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 00000000000..7f7dffa5be4 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,95 @@ +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET 
(parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY 
cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ROLLBACK ; + +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; -- 2.43.0 [application/octet-stream] v4-0001-Add-stress-tests-for-concurrent-index-builds.patch (11.9K, 3-v4-0001-Add-stress-tests-for-concurrent-index-builds.patch) download | inline diff: From e81b83bb594375789705a2beddc30d26e20be728 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 30 Nov 2024 16:24:20 +0100 Subject: [PATCH v4 1/4] Add stress tests for concurrent index builds Introduce stress tests for concurrent index operations: - test concurrent inserts/updates during CREATE/REINDEX INDEX CONCURRENTLY - cover various index types (btree, gin, gist, brin, hash, spgist) - test unique and non-unique indexes - test with expressions and predicates - test both parallel and non-parallel operations These tests verify the behavior of the following commits. 
--- src/bin/pg_amcheck/meson.build | 1 + src/bin/pg_amcheck/t/006_cic.pl | 273 ++++++++++++++++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 src/bin/pg_amcheck/t/006_cic.pl diff --git a/src/bin/pg_amcheck/meson.build b/src/bin/pg_amcheck/meson.build index 592cef74ecb..51a62dccb7b 100644 --- a/src/bin/pg_amcheck/meson.build +++ b/src/bin/pg_amcheck/meson.build @@ -28,6 +28,7 @@ tests += { 't/003_check.pl', 't/004_verify_heapam.pl', 't/005_opclass_damage.pl', + 't/006_cic.pl', ], }, } diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl new file mode 100644 index 00000000000..0495ac10263 --- /dev/null +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -0,0 +1,273 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test REINDEX CONCURRENTLY with concurrent modifications and HOT updates +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use constant STRESS_PGBENCH_CLIENTS => 30; +use constant STRESS_PGBENCH_JOBS => 8; +use constant STRESS_PGBENCH_TRANSACTIONS => 10000; +use constant STRESS_MAX_SLEEP_MS => 10; + +use constant DEFAULT_PGBENCH_CLIENTS => 15; +use constant DEFAULT_PGBENCH_JOBS => 4; +use constant DEFAULT_PGBENCH_TRANSACTIONS => 500; +use constant DEFAULT_MAX_SLEEP_MS => 1; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my ($node, $result); +my $pg_test_extra = $ENV{PG_TEST_EXTRA} // ''; +my $is_stress = $pg_test_extra =~ /\bstress\b/ ? 1 : 0; +my $pgbench_clients = + $is_stress ? STRESS_PGBENCH_CLIENTS : DEFAULT_PGBENCH_CLIENTS; +my $pgbench_jobs = $is_stress ? STRESS_PGBENCH_JOBS : DEFAULT_PGBENCH_JOBS; +my $pgbench_transactions = + $is_stress ? STRESS_PGBENCH_TRANSACTIONS : DEFAULT_PGBENCH_TRANSACTIONS; +my $max_sleep_ms = $is_stress ? 
STRESS_MAX_SLEEP_MS : DEFAULT_MAX_SLEEP_MS; +my $pgbench_options = sprintf( + '--no-vacuum --client=%d --jobs=%d --exit-on-abort --transactions=%d', + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions); +my $no_hot = $is_stress ? int(rand(2)) : 0; + +print( + sprintf( + 'settings: PG_TEST_EXTRA=%s stress=%d clients=%d jobs=%d transactions=%d max_sleep_ms=%d no_hot=%d', + defined($ENV{PG_TEST_EXTRA}) + ? ($pg_test_extra eq '' ? '(empty)' : $pg_test_extra) + : '(undef)', + $is_stress, + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions, + $max_sleep_ms, + $no_hot)); +print "\n"; + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('RC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . (1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->append_conf('postgresql.conf', 'fsync = off'); +$node->append_conf('postgresql.conf', 'maintenance_work_mem = 32MB'); # to avoid OOM +$node->append_conf('postgresql.conf', 'shared_buffers = 32MB'); # to avoid OOM +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE UNLOGGED TABLE tbl(i int primary key, + c1 money default 0, c2 money default 0, + c3 money default 0, updated_at timestamp, + ia int4[], p point))); + +if ($no_hot) { $node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i, updated_at);)); } + +# create sequence +$node->safe_psql('postgres', q(CREATE UNLOGGED SEQUENCE in_row_rebuild START 1 INCREMENT 1;)); +$node->safe_psql('postgres', q(SELECT nextval('in_row_rebuild');)); + +# Create helper functions for predicate tests +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_stable() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN true; + END; $$; +)); + +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_const(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + RETURN MOD($1, 2) = 0; + END; $$; +)); + 
+# Run CIC/RIC in different options concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY', + { + 'concurrent_ops' => sprintf(q( + SET debug_parallel_query = off; -- this is because predicate_stable implementation + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set variant random(0, 5) + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_stable(); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE MOD(i, 2) = 0; + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_const(i); + \elif :variant = 4 + CREATE INDEX CONCURRENTLY new_idx ON tbl(predicate_const(i)); + \elif :variant = 5 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, predicate_const(i), updated_at) WHERE predicate_const(i); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1000, 100000) + BEGIN; + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) 
DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + COMMIT; + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for unique index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for unique BTREE', + { + 'concurrent_ops_unique_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE UNIQUE INDEX CONCURRENTLY new_idx ON tbl(i); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl 
VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIN with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_gin_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIN (ia); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIST/BRIN/HASH/SPGIST index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_other_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset 
+ \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \set variant random(0, 3) + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIST (p); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING BRIN (updated_at); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING HASH (updated_at); + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING SPGIST (p); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->stop; +done_testing(); -- 2.43.0 [application/octet-stream] v4-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch (40.9K, 4-v4-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch) download | inline diff: From bfcf5a95f8f00b8d8718d6f7c1c009da22e3f7b8 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 19:02:34 +0300 Subject: [PATCH v4 4/4] Support snapshot resets in concurrent builds of unique indexes Previously, concurrent builds of unique index used a fixed snapshot for the entire scan to ensure proper uniqueness checks. 
Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: - ignoring SnapshotSelf-dead tuples during uniqueness checks in tuplesort, not as a guarantee but as a fail-fast mechanism - adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values, as the guarantee of correctness Tuples are tested against SnapshotSelf only when their index key values are equal; otherwise _bt_load works as before. --- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 6 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtreadpage.c | 2 +- src/backend/access/nbtree/nbtsort.c | 208 +++++++++++++----- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 30 ++- src/backend/catalog/index.c | 8 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/sort/tuplesortvariants.c | 71 +++++- src/include/access/nbtree.h | 4 +- src/include/access/tableam.h | 2 +- src/include/utils/tuplesort.h | 1 + .../expected/cic_reset_snapshots.out | 6 + 14 files changed, 274 insertions(+), 100 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375a..829dad1194e 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. Any tuples visible in the -snapshot will have only valid forward-growing HOT chains. (They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. 
+for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) +As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a639e8b31e0..04738d59b91 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1261,15 +1261,15 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique index we need consistent snapshot for the whole scan. + * For concurrent builds of non-system indexes, we may want to periodically + * reset snapshots to allow vacuum to clean up tuples. 
*/ reset_snapshots = indexInfo->ii_Concurrent && - !indexInfo->ii_Unique && !is_system_catalog; /* just for the case */ /* okay to ignore lazy VACUUMs here */ diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index af7affdf409..10f4f7eeba9 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -149,7 +149,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -376,7 +376,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index 2ba1ca66023..c4c278e777a 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -623,7 +623,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) 
PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6594c3e15b2..eaa3f09451a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -87,6 +87,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -105,6 +106,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -207,15 +209,14 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and built + * non-concurrently. Dead tuples are put into spool2 instead of spool + * in order to avoid uniqueness check. 
*/ BTSpool *spool2; double indtuples; @@ -262,7 +263,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -307,8 +308,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -325,20 +324,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -385,6 +384,11 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks in case of concurrent build. + * It is required because of periodic reset of snapshot. + */ + btspool->unique_dead_ignored = indexInfo->ii_Concurrent && indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -433,8 +437,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor. */ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -442,8 +447,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. 
+ * + * In case of concurrent build dead tuples do not need to be put into index + * since we wait for all snapshots older than reference snapshot during the + * validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent) { BTSpool *btspool2 = palloc0_object(BTSpool); SortCoordinate coordinate2 = NULL; @@ -474,7 +483,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -487,7 +496,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -543,7 +552,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent) { BTWriteState wstate; @@ -565,7 +574,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; @@ -579,7 +588,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); - Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1156,13 +1165,118 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; + bool fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * The unique_dead_ignored does not guarantee absence of multiple alive + * tuples with the same values in the spool. Such a case may happen if + * alive tuples are located between a few dead tuples, like this: addda. 
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* if this is not the first tuple */ + { + bool has_nulls = false, + call_again = false, + ignored = false, + now_alive; + ItemPointerData tid; + + /* if this tuple equal to previous one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* check previous tuple if not yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group? 
*/ + if (seen_alive && now_alive) + { + char *key_desc; + TupleDesc tupDes = RelationGetDescr(wstate->index); + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + + index_deform_tuple(itup, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(wstate->index, values, isnull); + + /* keep this message in sync with the same in comparetup_index_btree_tiebreak */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(wstate->index)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(wstate->heap, + RelationGetRelationName(wstate->index)))); + } + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + if (prev) pfree(prev); + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1322,7 +1436,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1419,7 +1533,6 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; - bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1437,21 +1550,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; - /* - * For concurrent non-unique index builds, we can periodically reset snapshots - * to allow the xmin horizon to advance. This is safe since these builds don't - * require a consistent view across the entire scan. Unique indexes still need - * a stable snapshot to properly enforce uniqueness constraints. - */ - reset_snapshot = isconcurrent && !btspool->isunique; - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that, while that snapshot may be reset periodically in - * case of non-unique index. + * live according to that, while that snapshot may be reset periodically. */ if (!isconcurrent) { @@ -1459,16 +1563,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } - else if (reset_snapshot) + else { + /* + * For concurrent index builds, we can periodically reset snapshots to allow + * the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. 
+ */ snapshot = InvalidSnapshot; PushActiveSnapshot(GetTransactionSnapshot()); } - else - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1480,10 +1584,10 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_estimate_chunk(&pcxt->estimator, estsort); /* - * Unique case requires a second spool, and so we may have to account for - * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 + * Non-concurrent unique case requires a second spool, and so we may have + * to account for another shared workspace -- PARALLEL_KEY_TUPLESORT_SPOOL2 */ - if (!btspool->isunique) + if (!btspool->isunique || isconcurrent) shm_toc_estimate_keys(&pcxt->estimator, 2); else { @@ -1538,6 +1642,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1552,7 +1657,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), snapshot, - reset_snapshot); + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. 
@@ -1565,8 +1670,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); - /* Unique case requires a second spool, and associated shared state */ - if (!btspool->isunique) + /* Non-concurrent unique case requires a second spool and shared state */ + if (!btspool->isunique || isconcurrent) sharedsort2 = NULL; else { @@ -1632,7 +1737,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * In case of concurrent build snapshots are going to be reset periodically. * Wait until all workers imported initial snapshot. */ - if (reset_snapshot) + if (isconcurrent) WaitForParallelWorkersToAttach(pcxt, true); /* Join heap scan ourselves */ @@ -1643,13 +1748,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - if (!reset_snapshot) + if (!isconcurrent) WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); InvalidateCatalogSnapshot(); - Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1749,9 +1854,10 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ - if (!btleader->btshared->isunique) + if (!btleader->btshared->isunique || btleader->btshared->isconcurrent) leaderworker2 = NULL; else { @@ -1852,11 +1958,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || btshared->isconcurrent) { btspool2 = NULL; sharedsort2 = NULL; @@ -1936,6 +2043,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1958,14 +2066,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate 
for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index de9eca3c8b2..b174fb8d3ee 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -688,7 +688,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -719,7 +719,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -968,7 +968,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -989,7 +989,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1028,7 +1028,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1150,7 +1150,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 9b091858997..13fb676e389 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -30,9 +30,6 @@ static int _bt_compare_int(const void *va, const void *vb); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); - /* * _bt_mkscankey @@ -708,7 +705,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -826,17 +823,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. 
* + * This is exported to be used as a comparison function during concurrent + * unique index builds when _bt_keep_natts_fast is not suitable because the + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * *hasnulls (if provided) is set to true when any compared column is NULL. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -862,6 +866,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; if (isNull1 != isNull2) break; @@ -881,7 +887,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -892,7 +898,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. + * save cycles. 
Also, it may be used as a comparison function during + * concurrent build of a unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -901,6 +908,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * *hasnulls (if provided) is set to true when any compared column is NULL. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -909,7 +918,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -926,6 +936,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 7c7a41fd09e..d0eef7b1b04 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1534,7 +1534,7 @@ index_concurrently_build(Oid heapRelationId, index_build(heapRel, indexRelation, indexInfo, false, true); InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions 
*/ AtEOXact_GUC(false, save_nestlevel); @@ -3325,9 +3325,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 415d0236c46..97029be378f 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1702,8 +1702,8 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. 
Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 2509ac3e3a4..c3d8f92434a 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -25,6 +25,8 @@ #include "access/hash.h" #include "access/htup_details.h" #include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" #include "catalog/index.h" #include "catalog/pg_collation.h" #include "executor/executor.h" @@ -35,6 +37,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* sort-type codes for sort__start probes */ @@ -136,6 +139,7 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ } TuplesortIndexBTreeArg; /* @@ -361,6 +365,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -403,6 +408,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1670,6 +1676,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1679,18 +1686,58 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - 
- ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* This is fail-fast check, see _bt_load for details. */ + if (arg->uniqueDeadIgnored) + { + bool any_tuple_dead, + call_again = false, + ignored; + + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index da7503c57b6..b72445a7610 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1309,8 +1309,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 84b06ffa42f..92290e79591 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1784,7 +1784,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent index build, + * In case of concurrent index build, * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots * on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index da68f45acf2..448dc83aa58 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -396,6 +396,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index f2f2025b2c8..bb84d61f40d 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -41,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); (1 row) CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON 
cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_parallelscan_initialize -- 2.43.0 [application/octet-stream] v4-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch (41.5K, 5-v4-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch) download | inline diff: From 4030ea5249932305d6f1e5af1fee7a4637e21c5e Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 18:45:56 +0300 Subject: [PATCH v4 3/4] Support snapshot resets in parallel concurrent index builds Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. The main obstacle to applying this technique to parallel builds was the requirement to wait until worker processes restore their initial snapshot from the leader. To address this, the following changes are applied: - add infrastructure to track snapshot restoration in parallel workers - extend parallel scan initialization to support periodic snapshot resets - wait for parallel workers to restore their initial snapshots before proceeding with the scan - relax the limitation on parallel workers calling GetLatestSnapshot --- src/backend/access/brin/brin.c | 54 +++++++++------- src/backend/access/gin/gininsert.c | 51 ++++++++------- src/backend/access/heap/heapam_handler.c | 12 ++-- src/backend/access/nbtree/nbtsort.c | 60 +++++++++++++----- src/backend/access/table/tableam.c | 37 +++++++++-- src/backend/access/transam/parallel.c | 62 +++++++++++++++++-- src/backend/catalog/index.c | 2 +- src/backend/executor/nodeSeqscan.c | 3 +- src/backend/executor/nodeTidrangescan.c | 3 +- src/backend/utils/time/snapmgr.c | 8 --- src/include/access/parallel.h | 3 +- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 5 +- .../expected/cic_reset_snapshots.out | 25 +++++++- .../sql/cic_reset_snapshots.sql | 5 +- 15 files changed, 244 insertions(+), 87 deletions(-) diff --git 
a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 8534d7125a0..d3fc8c14a79 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -147,7 +147,6 @@ typedef struct BrinLeader */ BrinShared *brinshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } BrinLeader; @@ -235,7 +234,7 @@ static void brin_fill_empty_ranges(BrinBuildState *state, static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state); -static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _brin_parallel_estimate_shared(Relation heap); static double _brin_parallel_heapscan(BrinBuildState *state); static double _brin_parallel_merge(BrinBuildState *state); static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, @@ -1226,7 +1225,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1259,7 +1257,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -1275,6 +1272,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = idxtuples; + InvalidateCatalogSnapshot(); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + return result; } @@ -2384,7 +2384,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estbrinshared; 
Size estsort; BrinShared *brinshared; @@ -2415,25 +2414,25 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. */ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. */ - estbrinshared = _brin_parallel_estimate_shared(heap, snapshot); + estbrinshared = _brin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -2473,8 +2472,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -2499,7 +2496,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromBrinShared(brinshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. 
@@ -2545,7 +2543,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, brinleader->nparticipanttuplesorts++; brinleader->brinshared = brinshared; brinleader->sharedsort = sharedsort; - brinleader->snapshot = snapshot; brinleader->walusage = walusage; brinleader->bufferusage = bufferusage; @@ -2561,6 +2558,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = brinleader; + /* + * In a concurrent build, snapshots are going to be reset periodically. + * We need to wait until all workers have imported the initial snapshot. + */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _brin_leader_participate_as_worker(buildstate, heap, index); @@ -2569,9 +2573,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!brinleader->brinshared->isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -2592,9 +2600,6 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(brinleader->snapshot)) - UnregisterSnapshot(brinleader->snapshot); DestroyParallelContext(brinleader->pcxt); ExitParallelMode(); } @@ -2794,14 +2799,14 @@ _brin_parallel_merge(BrinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * brin index build based on the snapshot its parallel scan will use. + * brin index build. */ static Size -_brin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_brin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(BrinShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -2963,6 +2968,13 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort, heapRel, indexRel, sortmem, false); + if (brinshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index c3085d174c7..e9220b0a72e 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -138,7 +138,6 @@ typedef struct GinLeader */ GinBuildShared *ginshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } GinLeader; @@ -188,7 +187,7 @@ typedef struct static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state); -static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _gin_parallel_estimate_shared(Relation heap); static double _gin_parallel_heapscan(GinBuildState *state); static double _gin_parallel_merge(GinBuildState *state); static void _gin_leader_participate_as_worker(GinBuildState *buildstate, @@ -755,7 +754,6 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -779,7 +777,6 @@ ginbuild(Relation heap, Relation index, IndexInfo 
*indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } MemoryContextDelete(buildstate.funcCtx); @@ -809,6 +806,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } @@ -947,7 +945,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estginshared; Size estsort; GinBuildShared *ginshared; @@ -977,25 +974,25 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. */ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. 
 */ - estginshared = _gin_parallel_estimate_shared(heap, snapshot); + estginshared = _gin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estginshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -1035,8 +1032,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -1060,7 +1055,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromGinBuildShared(ginshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1102,7 +1098,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, ginleader->nparticipanttuplesorts++; ginleader->ginshared = ginshared; ginleader->sharedsort = sharedsort; - ginleader->snapshot = snapshot; ginleader->walusage = walusage; ginleader->bufferusage = bufferusage; @@ -1118,6 +1113,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = ginleader; + /* + * In a concurrent build, snapshots are going to be reset periodically. + * We need to wait until all workers have imported the initial snapshot. + */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _gin_leader_participate_as_worker(buildstate, heap, index); @@ -1126,9 +1128,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!ginleader->ginshared->isconcurrent || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1149,9 +1154,6 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(ginleader->snapshot)) - UnregisterSnapshot(ginleader->snapshot); DestroyParallelContext(ginleader->pcxt); ExitParallelMode(); } @@ -1826,14 +1828,14 @@ _gin_parallel_merge(GinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * gin index build based on the snapshot its parallel scan will use. + * gin index build. */ static Size -_gin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_gin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(GinBuildShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -2218,6 +2220,13 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort, heapRel, indexRel, sortmem, false); + if (ginshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 9cad2e115bd..a639e8b31e0 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1260,14 +1260,13 @@ heapam_index_build_range_scan(Relation heapRelation, * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. + * and index whatever's live according to that while that snapshot is reset + * every so often (in case of non-unique index). */ OldestXmin = InvalidTransactionId; /* * For unique index we need consistent snapshot for the whole scan. - * In case of parallel scan some additional infrastructure required - * to perform scan with SO_RESET_SNAPSHOT which is not yet ready. 
*/ reset_snapshots = indexInfo->ii_Concurrent && !indexInfo->ii_Unique && @@ -1329,8 +1328,11 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; - PushActiveSnapshot(snapshot); - need_pop_active_snapshot = true; + if (!reset_snapshots) + { + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; + } } hscan = (HeapScanDesc) scan; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6f091d39813..6594c3e15b2 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -325,22 +325,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -489,8 +487,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -1422,6 +1419,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1439,12 +1437,21 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; + /* + * For concurrent non-unique index builds, we can periodically reset snapshots + * to allow the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. Unique indexes still need + * a stable snapshot to properly enforce uniqueness constraints. 
+ */ + reset_snapshot = isconcurrent && !btspool->isunique; + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * live according to that, while that snapshot may be reset periodically in + * case of non-unique index. */ if (!isconcurrent) { @@ -1452,6 +1459,11 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); @@ -1512,7 +1524,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1539,7 +1551,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->brokenhotchain = false; table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1615,6 +1628,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* Save leader state now that it's clear build will be parallel */ buildstate->btleader = btleader; + /* + * In case of concurrent build snapshots are going to be reset periodically. + * Wait until all workers imported initial snapshot. 
+ */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _bt_leader_participate_as_worker(buildstate); @@ -1623,9 +1643,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1647,7 +1671,7 @@ _bt_end_parallel(BTLeader *btleader) InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(btleader->snapshot)) + if (btleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(btleader->snapshot)) UnregisterSnapshot(btleader->snapshot); DestroyParallelContext(btleader->pcxt); ExitParallelMode(); @@ -1897,6 +1921,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, SortCoordinate coordinate; BTBuildState buildstate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -1951,11 +1976,15 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, /* Join parallel scan */ indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; - scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared)); + pscan = ParallelTableScanFromBTShared(btshared); + scan = table_beginscan_parallel(btspool->heap, pscan); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || 
!TransactionIdIsValid(MyProc->xmin)); /* Execute this worker's part of the sort */ if (progress) @@ -1991,4 +2020,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, tuplesort_end(btspool->sortstate); if (btspool2) tuplesort_end(btspool2->sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index dfda1af412e..26df5638921 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -132,10 +132,10 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) { Size sz = 0; - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) sz = add_size(sz, EstimateSnapshotSpace(snapshot)); else - Assert(snapshot == SnapshotAny); + Assert(snapshot == SnapshotAny || snapshot == InvalidSnapshot); sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); @@ -144,21 +144,36 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot) + Snapshot snapshot, bool reset_snapshot) { Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); pscan->phs_snapshot_off = snapshot_off; - if (IsMVCCSnapshot(snapshot)) + /* + * Initialize parallel scan description. For normal scans with a regular + * MVCC snapshot, serialize the snapshot info. For scans that use periodic + * snapshot resets, mark the scan accordingly. 
+ */ + if (reset_snapshot) + { + Assert(snapshot == InvalidSnapshot); + pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = true; + INJECTION_POINT("table_parallelscan_initialize", NULL); + } + else if (IsMVCCSnapshot(snapshot)) { SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = false; } else { Assert(snapshot == SnapshotAny); + Assert(!reset_snapshot); pscan->phs_snapshot_any = true; + pscan->phs_reset_snapshot = false; } } @@ -171,7 +186,19 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); - if (!pscan->phs_snapshot_any) + /* + * For scans that + * use periodic snapshot resets, mark the scan accordingly and use the active + * snapshot as the initial state. + */ + if (pscan->phs_reset_snapshot) + { + Assert(ActiveSnapshotSet()); + flags |= SO_RESET_SNAPSHOT; + /* Start with current active snapshot. */ + snapshot = GetActiveSnapshot(); + } + else if (!pscan->phs_snapshot_any) { /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index ab1dfb30e73..0d2018ce154 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -79,6 +79,7 @@ #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) #define PARALLEL_KEY_CLIENTCONNINFO UINT64CONST(0xFFFFFFFFFFFF000F) +#define PARALLEL_KEY_SNAPSHOT_RESTORED UINT64CONST(0xFFFFFFFFFFFF0010) /* Fixed-size parallel state. 
*/ typedef struct FixedParallelState @@ -308,6 +309,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(bool), + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + /* Estimate how much we'll need for the entrypoint info. */ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + strlen(pcxt->function_name) + 2); @@ -379,6 +384,7 @@ InitializeParallelDSM(ParallelContext *pcxt) char *entrypointstate; char *uncommittedenumsspace; char *clientconninfospace; + bool *snapshot_set_flag_space; Size lnamelen; /* Serialize shared libraries we have loaded. */ @@ -494,6 +500,19 @@ InitializeParallelDSM(ParallelContext *pcxt) strcpy(entrypointstate, pcxt->library_name); strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + + /* + * Establish dynamic shared memory to pass information about importing + * of snapshot. + */ + snapshot_set_flag_space = + shm_toc_allocate(pcxt->toc, mul_size(sizeof(bool), pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + pcxt->worker[i].snapshot_restored = &snapshot_set_flag_space[i]; + *pcxt->worker[i].snapshot_restored = false; + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, snapshot_set_flag_space); } /* Update nworkers_to_launch, in case we changed nworkers above. */ @@ -670,6 +689,10 @@ LaunchParallelWorkers(ParallelContext *pcxt) * Wait for all workers to attach to their error queues, and throw an error if * any worker fails to do this. * + * wait_for_snapshot: track whether each parallel worker has successfully restored + * its snapshot. This is needed when using periodic snapshot resets to ensure all + * workers have a valid initial snapshot before proceeding with the scan. 
+ * * Callers can assume that if this function returns successfully, then the * number of workers given by pcxt->nworkers_launched have initialized and * attached to their error queues. Whether or not these workers are guaranteed @@ -699,7 +722,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) * call this function at all. */ void -WaitForParallelWorkersToAttach(ParallelContext *pcxt) +WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot) { int i; @@ -743,9 +766,24 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); if (shm_mq_get_sender(mq) != NULL) { - /* Yes, so it is known to be attached. */ - pcxt->known_attached_workers[i] = true; - ++pcxt->nknown_attached_workers; + pg_read_barrier(); + if (!wait_for_snapshot || *pcxt->worker[i].snapshot_restored) + { + /* Worker is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + else + { + /* + * Worker attached but hasn't restored its snapshot yet. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10, WAIT_EVENT_BGWORKER_STARTUP); + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } } } else if (status == BGWH_STOPPED) @@ -788,6 +826,16 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) break; } } + + /* Set snapshot restored flag to false. 
 */ + if (pcxt->nworkers > 0) + { + bool *snapshot_restored_space; + snapshot_restored_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + for (i = 0; i < pcxt->nworkers; ++i) + snapshot_restored_space[i] = false; + } } /* @@ -1304,6 +1352,7 @@ ParallelWorkerMain(Datum main_arg) shm_toc *toc; FixedParallelState *fps; char *error_queue_space; + bool *snapshot_restored_space; shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; @@ -1507,6 +1556,11 @@ ParallelWorkerMain(Datum main_arg) fps->parallel_leader_pgproc); PushActiveSnapshot(asnapshot); + /* Snapshot is restored; set the flag to let the leader know about it. */ + snapshot_restored_space = shm_toc_lookup(toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + snapshot_restored_space[ParallelWorkerNumber] = true; + pg_write_barrier(); + /* * We've changed which tuples we can see, and must therefore invalidate * system caches. diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f4dc4563c91..7c7a41fd09e 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1534,7 +1534,7 @@ index_concurrently_build(Oid heapRelationId, index_build(heapRel, indexRelation, indexInfo, false, true); InvalidateCatalogSnapshot(); - Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 8f219f60a93..06a91692072 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -372,7 +372,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, 
node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel(node->ss.ss_currentRelation, pscan); diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 617713bde04..f1bf737f395 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -456,7 +456,8 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 2e6197f5f35..df9116532c0 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -353,14 +353,6 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { - /* - * We might be able to relax this, but nothing that could otherwise work - * needs it. - */ - if (IsInParallelMode()) - elog(ERROR, - "cannot update SecondarySnapshot during a parallel operation"); - /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if required. 
diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 60f857675e0..9a3e14e0c0e 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -28,6 +28,7 @@ typedef struct ParallelWorkerInfo { BackgroundWorkerHandle *bgwhandle; shm_mq_handle *error_mqh; + bool *snapshot_restored; } ParallelWorkerInfo; typedef struct ParallelContext @@ -67,7 +68,7 @@ extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); extern void LaunchParallelWorkers(ParallelContext *pcxt); -extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot); extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); extern void DestroyParallelContext(ParallelContext *pcxt); extern bool ParallelContextActive(void); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index ce340c076f8..acfa06aed78 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -81,6 +81,7 @@ typedef struct ParallelTableScanDescData RelFileLocator phs_locator; /* physical relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + bool phs_reset_snapshot; /* use SO_RESET_SNAPSHOT? 
*/ Size phs_snapshot_off; /* data for snapshot */ } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index d0709782dde..84b06ffa42f 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1151,7 +1151,8 @@ extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); */ extern void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot); + Snapshot snapshot, + bool reset_snapshot); /* * Begin a parallel scan. `pscan` needs to have been initialized with @@ -1783,7 +1784,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique index and non-parallel concurrent build, + * In case of non-unique concurrent index build, * SO_RESET_SNAPSHOT is applied for the scan. That leads to changing snapshots * on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index b4ad90eb339..f2f2025b2c8 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -17,6 +17,12 @@ SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice' (1 row) +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -72,30 +78,45 @@ NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + injection_points_detach +------------------------- + +(1 row) + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE 
INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; BEGIN TRANSACTION; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 7f7dffa5be4..37819bf0fb7 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -3,7 +3,7 @@ CREATE EXTENSION injection_points; SELECT injection_points_set_local(); SELECT 
injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); - +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); @@ -53,6 +53,9 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- 2.43.0 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY 2026-03-09 00:03 Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-03-20 01:15 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-03-21 23:50 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> @ 2026-04-06 17:55 ` Mihail Nikalayeu <[email protected]> 2026-04-11 17:48 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 0 siblings, 1 reply; 6+ messages in thread From: Mihail Nikalayeu @ 2026-04-06 17:55 UTC (permalink / raw) To: Álvaro Herrera <[email protected]>; +Cc: PostgreSQL Hackers <[email protected]>; Matthias van de Meent <[email protected]>; Antonin Houska <[email protected]>; Sergey Sargsyan <[email protected]>; Hannu Krosing <[email protected]> Hello! 0) Rebased 1) Added REPEATABLE READ isolation level 2) Some GUCs for developers 3) Some small refactorings Attachments: [application/octet-stream] v5-0001-Add-stress-tests-for-concurrent-index-builds.patch (12.5K, 2-v5-0001-Add-stress-tests-for-concurrent-index-builds.patch) download | inline diff: From 9d99b545fbb2f2ee938851c154a392b2cbe94850 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 30 Nov 2024 16:24:20 +0100 Subject: [PATCH v5 1/4] Add stress tests for concurrent index builds Introduce stress tests for concurrent index operations: - test concurrent inserts/updates during CREATE/REINDEX INDEX CONCURRENTLY - cover various index types (btree, gin, gist, brin, hash, spgist) - test unique and non-unique indexes - test with expressions and predicates - test both parallel and non-parallel operations These tests verify the behavior of the following commits. 
--- src/bin/pg_amcheck/meson.build | 1 + src/bin/pg_amcheck/t/006_cic.pl | 293 ++++++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+) create mode 100644 src/bin/pg_amcheck/t/006_cic.pl diff --git a/src/bin/pg_amcheck/meson.build b/src/bin/pg_amcheck/meson.build index 592cef74ecb..51a62dccb7b 100644 --- a/src/bin/pg_amcheck/meson.build +++ b/src/bin/pg_amcheck/meson.build @@ -28,6 +28,7 @@ tests += { 't/003_check.pl', 't/004_verify_heapam.pl', 't/005_opclass_damage.pl', + 't/006_cic.pl', ], }, } diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl new file mode 100644 index 00000000000..dd7a1eff0ef --- /dev/null +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -0,0 +1,293 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test REINDEX CONCURRENTLY with concurrent modifications and HOT updates +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use constant STRESS_PGBENCH_CLIENTS => 30; +use constant STRESS_PGBENCH_JOBS => 8; +use constant STRESS_PGBENCH_TRANSACTIONS => 10000; +use constant STRESS_MAX_SLEEP_MS => 10; + +use constant DEFAULT_PGBENCH_CLIENTS => 15; +use constant DEFAULT_PGBENCH_JOBS => 4; +use constant DEFAULT_PGBENCH_TRANSACTIONS => 500; +use constant DEFAULT_MAX_SLEEP_MS => 1; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my $node; +my $pg_test_extra = $ENV{PG_TEST_EXTRA} // ''; +my $is_stress = $pg_test_extra =~ /\bstress\b/ ? 1 : 0; +my $pgbench_clients = + $is_stress ? STRESS_PGBENCH_CLIENTS : DEFAULT_PGBENCH_CLIENTS; +my $pgbench_jobs = $is_stress ? STRESS_PGBENCH_JOBS : DEFAULT_PGBENCH_JOBS; +my $pgbench_transactions = + $is_stress ? STRESS_PGBENCH_TRANSACTIONS : DEFAULT_PGBENCH_TRANSACTIONS; +my $max_sleep_ms = $is_stress ? 
STRESS_MAX_SLEEP_MS : DEFAULT_MAX_SLEEP_MS; +my $pgbench_options = sprintf( + '--no-vacuum --client=%d --jobs=%d --exit-on-abort --transactions=%d', + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions); +my $no_hot = $is_stress ? int(rand(2)) : 0; + +print( + sprintf( + 'settings: PG_TEST_EXTRA=%s stress=%d clients=%d jobs=%d transactions=%d max_sleep_ms=%d no_hot=%d', + defined($ENV{PG_TEST_EXTRA}) + ? ($pg_test_extra eq '' ? '(empty)' : $pg_test_extra) + : '(undef)', + $is_stress, + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions, + $max_sleep_ms, + $no_hot)); +print "\n"; + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('RC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . (1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->append_conf('postgresql.conf', 'fsync = off'); +$node->append_conf('postgresql.conf', 'maintenance_work_mem = 32MB'); # to avoid OOM +$node->append_conf('postgresql.conf', 'shared_buffers = 32MB'); # to avoid OOM +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE UNLOGGED TABLE tbl(i int primary key, + c1 money default 0, c2 money default 0, + c3 money default 0, updated_at timestamp, + ia int4[], p point))); + +if ($no_hot) { $node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i, updated_at);)); } + +# create sequence +$node->safe_psql('postgres', q(CREATE UNLOGGED SEQUENCE in_row_rebuild START 1 INCREMENT 1;)); +$node->safe_psql('postgres', q(SELECT nextval('in_row_rebuild');)); + +# Create helper functions for predicate tests +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_stable() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN true; + END; $$; +)); + +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_const(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + RETURN MOD($1, 2) = 0; + END; $$; +)); + 
+# Run CIC/RIC in different options concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY', + { + 'concurrent_ops' => sprintf(q( + SET debug_parallel_query = off; -- this is because predicate_stable implementation + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set variant random(0, 5) + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_stable(); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE MOD(i, 2) = 0; + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_const(i); + \elif :variant = 4 + CREATE INDEX CONCURRENTLY new_idx ON tbl(predicate_const(i)); + \elif :variant = 5 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, predicate_const(i), updated_at) WHERE predicate_const(i); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1000, 100000) + BEGIN; + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = 
point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + COMMIT; + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for unique index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for unique BTREE', + { + 'concurrent_ops_unique_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + CREATE UNIQUE INDEX CONCURRENTLY new_idx ON tbl(i); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + 
\sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIN with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN', + { + 'concurrent_ops_gin_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIN (ia); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + 
}); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIST/BRIN/HASH/SPGIST index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_other_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + \set variant random(0, 3) + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIST (p); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING BRIN (updated_at); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING HASH (updated_at); + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING SPGIST (p); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->stop; +done_testing(); -- 2.43.0 [application/octet-stream] v5-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch (37.7K, 3-v5-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch) download | inline diff: From 27aa9e530fc5cb322efc7ada388ac635532f01a9 Mon Sep 
17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Wed, 14 Jan 2026 17:02:34 +0100 Subject: [PATCH v5 4/4] Support snapshot resets in concurrent builds of unique indexes Previously, concurrent builds of unique index used a fixed snapshot for the entire scan to ensure proper uniqueness checks. Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: - ignoring SnapshotSelf dead tuples during uniqueness checks in tuplesort as not a guarantee, but a fail-fast mechanics - adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values as a guarantee of correctness Tuples are SnapshotSelf tested only in the case of equal index key values, otherwise _bt_load works like before. --- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 7 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtreadpage.c | 2 +- src/backend/access/nbtree/nbtsort.c | 175 +++++++++++++++--- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 30 ++- src/backend/catalog/index.c | 6 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/sort/tuplesortvariants.c | 80 ++++++-- src/include/access/nbtree.h | 4 +- src/include/access/tableam.h | 7 +- src/include/catalog/index.h | 2 - src/include/utils/tuplesort.h | 1 + .../expected/cic_reset_snapshots.out | 6 + 15 files changed, 274 insertions(+), 82 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375a..829dad1194e 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. 
Any tuples visible in the -snapshot will have only valid forward-growing HOT chains. (They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. +for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) +As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index fb2d61360c9..00b1b5ac1fb 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1165,13 +1165,14 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in the case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique indexes we need a consistent snapshot for the whole scan. - * Resetting snapshots also doesn't work in xact-snapshot isolation modes, + * For concurrent builds of non-system indexes, we want to periodically + * reset snapshots. + * Resetting snapshots doesn't work in xact-snapshot isolation modes, * because those keep a registered transaction snapshot for the whole xact. 
*/ reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index af7affdf409..10f4f7eeba9 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -149,7 +149,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -376,7 +376,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index 2ba1ca66023..c4c278e777a 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -623,7 +623,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's 
page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6bb365c951d..e02a4a60a8b 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -87,6 +87,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -105,6 +106,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -207,15 +209,14 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and built + * non-concurrently. Dead tuples are put into spool2 instead of spool + * in order to avoid uniqueness check. 
*/ BTSpool *spool2; double indtuples; @@ -262,7 +263,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -307,8 +308,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -385,6 +384,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks only when concurrent + * unique builds actually use periodic snapshot resets. + */ + btspool->unique_dead_ignored = IndexBuildResetsSnapshots(indexInfo) && + indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -433,8 +438,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor. 
*/ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -442,8 +448,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. + * + * In case of concurrent build dead tuples do not need to be put into index + * since we wait for all snapshots older than reference snapshot during the + * validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !IndexBuildResetsSnapshots(indexInfo)) { BTSpool *btspool2 = palloc0_object(BTSpool); SortCoordinate coordinate2 = NULL; @@ -474,7 +484,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -1156,13 +1166,118 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; + bool fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * The unique_dead_ignored does not guarantee absence of multiple alive + * tuples with the same values in the spool. Such a case may happen if + * alive tuples are located between a few dead tuples, like this: addda. 
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap, SO_NONE); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* if this is not the first tuple */ + { + bool has_nulls = false, + call_again = false, + ignored = false, + now_alive; + ItemPointerData tid; + + /* is this tuple equal to previous one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* check previous tuple if not yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group? 
*/ + if (seen_alive && now_alive) + { + char *key_desc; + TupleDesc tupDes = RelationGetDescr(wstate->index); + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + + index_deform_tuple(itup, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(wstate->index, values, isnull); + + /* keep this message in sync with the same in comparetup_index_btree_tiebreak */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(wstate->index)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(wstate->heap, + RelationGetRelationName(wstate->index)))); + } + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + if (prev) pfree(prev); + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1322,7 +1437,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1438,15 +1553,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; /* - * For concurrent non-unique index builds, we can periodically reset - * snapshots to allow the xmin horizon to advance. This is safe since - * these builds don't require a consistent view across the entire scan. 
- * Unique indexes still need a stable snapshot to properly enforce - * uniqueness constraints. Isolation modes where + * For concurrent index builds, we can periodically reset snapshots to + * allow the xmin horizon to advance. + * Isolation modes where * IsolationUsesXactSnapshot() is true also prevent resetting because * they keep a registered transaction snapshot for the whole transaction. */ - reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot() && !btspool->isunique; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); /* * Prepare for scan of the base relation. In a normal index build, we use @@ -1454,7 +1567,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's * live according to that, while that snapshot may be reset periodically - * for non-unique indexes in non-xact-snapshot isolation modes. + * in non-xact-snapshot isolation modes. 
*/ if (!isconcurrent) { @@ -1483,10 +1596,10 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_estimate_chunk(&pcxt->estimator, estsort); /* - * Unique case requires a second spool, and so we may have to account for - * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 + * Non-concurrent unique case requires a second spool, and so we may have + * to account for another shared workspace -- PARALLEL_KEY_TUPLESORT_SPOOL2 */ - if (!btspool->isunique) + if (!btspool->isunique || reset_snapshot) shm_toc_estimate_keys(&pcxt->estimator, 2); else { @@ -1541,6 +1654,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1568,8 +1682,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); - /* Unique case requires a second spool, and associated shared state */ - if (!btspool->isunique) + /* Non-concurrent unique case requires a second spool and shared state */ + if (!btspool->isunique || reset_snapshot) sharedsort2 = NULL; else { @@ -1752,9 +1866,10 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ - if (!btleader->btshared->isunique) + if (!btleader->btshared->isunique || 
(btleader->btshared->isconcurrent && !IsolationUsesXactSnapshot())) leaderworker2 = NULL; else { @@ -1855,11 +1970,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || (btshared->isconcurrent && !IsolationUsesXactSnapshot())) { btspool2 = NULL; sharedsort2 = NULL; @@ -1939,6 +2055,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1961,14 +2078,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index de9eca3c8b2..b174fb8d3ee 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -688,7 +688,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, 
state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -719,7 +719,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -968,7 +968,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -989,7 +989,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1028,7 +1028,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1150,7 +1150,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 014faa1622f..b69bb61ce42 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -32,9 +32,6 @@ static int _bt_compare_int(const void *va, const void *vb); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); - /* * _bt_mkscankey @@ -707,7 +704,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -825,17 +822,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. 
* + * This is exported to be used as comparison function during concurrent + * unique index build in case _bt_keep_natts_fast is not suitable because + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * *hasnulls, if provided, is set to true if any compared column is null. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -861,6 +865,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; if (isNull1 != isNull2) break; @@ -880,7 +886,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -891,7 +897,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles.
Also, it may be used as a comparison function during concurrent + * builds of unique indexes. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -900,6 +907,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * *hasnulls, if provided, is set to true if any compared column is null. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -908,7 +917,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -925,6 +935,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index ce017e836be..fa073a33b65 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3344,9 +3344,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index.
* - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index be3dc5e8d28..9fff8635d3d 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1704,8 +1704,8 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. 
Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 2509ac3e3a4..1a2a8b64aa7 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -25,6 +25,8 @@ #include "access/hash.h" #include "access/htup_details.h" #include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" #include "catalog/index.h" #include "catalog/pg_collation.h" #include "executor/executor.h" @@ -35,6 +37,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* sort-type codes for sort__start probes */ @@ -136,6 +139,7 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ } TuplesortIndexBTreeArg; /* @@ -361,6 +365,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -403,6 +408,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1670,6 +1676,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1679,18 +1686,67 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - 
- ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* This is fail-fast check, see _bt_load for details. */ + if (arg->uniqueDeadIgnored) + { + bool any_tuple_dead, + call_again = false, + ignored; + /* + * Keep the slot/fetch lifetime local to this duplicate check so the + * xmin propagation assertion below continues to validate cleanup. + * + * XXX: This fail-fast check performs heap fetches from the sort + * comparator. That can be expensive when many equal keys are + * compared. Revisit whether this optimization is worth keeping, or + * whether it should be replaced with a bounded/probabilistic check + * or deferred entirely to _bt_load(). + */ + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel, SO_NONE); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, 
values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3097e9bb1af..1266c8eecdf 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1307,8 +1307,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 02c31caed2b..7e3e3242a16 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1853,9 +1853,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent build, - * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. - * That leads to changing snapshots on the fly to allow xmin horizon propagate. 
+ * In case of concurrent index build, + * concurrent_index_reset_snapshot_every_n_pages is applied for the + * scan. That leads to changing snapshots on the fly to allow the xmin + * horizon to propagate. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 60f8356ed82..ed000bf10d5 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -32,13 +32,11 @@ typedef struct AttrMap AttrMap; * * Snapshot resetting is only applicable when all of: * - the build is concurrent (ii_Concurrent) - * - the index is non-unique (unique needs consistent snapshot) * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a * registered transaction snapshot) */ #define IndexBuildResetsSnapshots(indexInfo) \ ((indexInfo)->ii_Concurrent && \ - !(indexInfo)->ii_Unique && \ !IsolationUsesXactSnapshot()) /* Action code for index_set_state_flags */ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index da68f45acf2..448dc83aa58 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -396,6 +396,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index e5a8a7f3a79..7835a796b88 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -41,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point
table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); (1 row) CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_parallelscan_initialize -- 2.43.0 [application/octet-stream] v5-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch (38.9K, 4-v5-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch) download | inline diff: From 2809e96d5cb559865e3e91883285c4cbefa4e332 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Fri, 3 Apr 2026 19:56:44 +0200 Subject: [PATCH v5 3/4] Support snapshot resets in parallel concurrent index builds Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. The main limitation of applying that technique to parallel builds was a requirement to wait until worker processes restore their initial snapshot from leader. 
To address this, the following changes are applied: - add infrastructure to track snapshot restoration in parallel workers - extend parallel scan initialization to support periodic snapshot resets - wait for parallel workers to restore their initial snapshots before proceeding with scan - relax limitation for parallel worker to call GetLatestSnapshot --- src/backend/access/brin/brin.c | 54 ++++++++++++---- src/backend/access/gin/gininsert.c | 51 +++++++++++---- src/backend/access/heap/heapam_handler.c | 7 +-- src/backend/access/nbtree/nbtsort.c | 51 +++++++++++++-- src/backend/access/table/tableam.c | 37 +++++++++-- src/backend/access/transam/parallel.c | 62 +++++++++++++++++-- src/backend/executor/nodeSeqscan.c | 3 +- src/backend/executor/nodeTidrangescan.c | 3 +- src/backend/utils/time/snapmgr.c | 8 --- src/include/access/parallel.h | 3 +- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 5 +- src/include/catalog/index.h | 4 +- .../expected/cic_reset_snapshots.out | 25 +++++++- .../sql/cic_reset_snapshots.sql | 5 +- 15 files changed, 259 insertions(+), 60 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 09741186264..ff57fe61add 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1275,6 +1275,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = idxtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + return result; } @@ -2394,6 +2397,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2411,12 +2415,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, scantuplesortstates = leaderparticipates ?
request + 1 : request; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot may be reset periodically (in non-xact-snapshot isolation + * modes) to allow the xmin horizon to advance. */ if (!isconcurrent) { @@ -2424,10 +2432,15 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); } /* @@ -2473,7 +2486,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -2499,7 +2512,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromBrinShared(brinshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. 
@@ -2561,6 +2575,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = brinleader; + /* + * In case of concurrent build with snapshot resets, wait until all + * workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _brin_leader_participate_as_worker(buildstate, heap, index); @@ -2569,9 +2590,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -2592,8 +2617,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(brinleader->snapshot)) + if (brinleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(brinleader->snapshot)) UnregisterSnapshot(brinleader->snapshot); DestroyParallelContext(brinleader->pcxt); ExitParallelMode(); @@ -2794,7 +2818,7 @@ _brin_parallel_merge(BrinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * brin index build based on the snapshot its parallel scan will use. + * brin index build. 
*/ static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot) @@ -2843,6 +2867,7 @@ _brin_parallel_scan_and_build(BrinBuildState *state, { SortCoordinate coordinate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -2859,13 +2884,18 @@ _brin_parallel_scan_and_build(BrinBuildState *state, /* Join parallel scan */ indexInfo = BuildIndexInfo(index); indexInfo->ii_Concurrent = brinshared->isconcurrent; + pscan = ParallelTableScanFromBrinShared(brinshared); scan = table_beginscan_parallel(heap, - ParallelTableScanFromBrinShared(brinshared), + pscan, SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, true, brinbuildCallbackParallel, state, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* insert the last item */ form_and_spill_tuple(state); @@ -2888,6 +2918,9 @@ _brin_parallel_scan_and_build(BrinBuildState *state, ConditionVariableSignal(&brinshared->workersdonecv); tuplesort_end(state->bs_sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } /* @@ -2964,7 +2997,6 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort, heapRel, indexRel, sortmem, false); - /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 5b035aa5dec..3251db0e617 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -809,6 +809,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = 
buildstate.indtuples; + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } @@ -957,6 +958,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -973,12 +975,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, scantuplesortstates = leaderparticipates ? request + 1 : request; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot may be reset periodically (in non-xact-snapshot isolation + * modes) to allow the xmin horizon to advance. 
*/ if (!isconcurrent) { @@ -986,10 +992,15 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); } /* @@ -1035,7 +1046,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1060,7 +1071,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromGinBuildShared(ginshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1118,6 +1130,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = ginleader; + /* + * In case of concurrent build with snapshot resets, wait until all + * workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _gin_leader_participate_as_worker(buildstate, heap, index); @@ -1126,9 +1145,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1149,8 +1171,7 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(ginleader->snapshot)) + if (ginleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(ginleader->snapshot)) UnregisterSnapshot(ginleader->snapshot); DestroyParallelContext(ginleader->pcxt); ExitParallelMode(); @@ -1826,7 +1847,7 @@ _gin_parallel_merge(GinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * gin index build based on the snapshot its parallel scan will use. + * gin index build. 
*/ static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot) @@ -2058,6 +2079,7 @@ _gin_parallel_scan_and_build(GinBuildState *state, { SortCoordinate coordinate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -2088,13 +2110,18 @@ _gin_parallel_scan_and_build(GinBuildState *state, /* Join parallel scan */ indexInfo = BuildIndexInfo(index); indexInfo->ii_Concurrent = ginshared->isconcurrent; + pscan = ParallelTableScanFromGinBuildShared(ginshared); scan = table_beginscan_parallel(heap, - ParallelTableScanFromGinBuildShared(ginshared), + pscan, SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, progress, ginBuildCallbackParallel, state, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* write remaining accumulated entries */ ginFlushBuildState(state, index); @@ -2124,6 +2151,9 @@ _gin_parallel_scan_and_build(GinBuildState *state, ConditionVariableSignal(&ginshared->workersdonecv); tuplesort_end(state->bs_sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } /* @@ -2219,7 +2249,6 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort, heapRel, indexRel, sortmem, false); - /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 754f5a720df..fb2d61360c9 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1164,7 +1164,8 @@ heapam_index_build_range_scan(Relation heapRelation, * SnapshotAny because 
we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. + * and index whatever's live according to that while that snapshot is reset + * every so often (in the case of non-unique index). */ OldestXmin = InvalidTransactionId; @@ -1172,8 +1173,6 @@ heapam_index_build_range_scan(Relation heapRelation, * For unique indexes we need a consistent snapshot for the whole scan. * Resetting snapshots also doesn't work in xact-snapshot isolation modes, * because those keep a registered transaction snapshot for the whole xact. - * In the case of parallel scan, some additional infrastructure is required - * to perform a scan with SO_RESET_SNAPSHOT which is not yet ready. */ reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && !is_system_catalog; /* just for the case */ @@ -1235,7 +1234,7 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; - if (IsMVCCSnapshot(snapshot)) + if (!reset_snapshots && IsMVCCSnapshot(snapshot)) { /* Don't expose SnapshotAny to SQL run by predicates/expressions. */ PushActiveSnapshot(snapshot); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 8d804d6bcfe..6bb365c951d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1419,6 +1419,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,12 +1437,24 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; + /* + * For concurrent non-unique index builds, we can periodically reset + * snapshots to allow the xmin horizon to advance. This is safe since + * these builds don't require a consistent view across the entire scan. + * Unique indexes still need a stable snapshot to properly enforce + * uniqueness constraints. Isolation modes where + * IsolationUsesXactSnapshot() is true also prevent resetting because + * they keep a registered transaction snapshot for the whole transaction. + */ + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot() && !btspool->isunique; + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * live according to that, while that snapshot may be reset periodically + * for non-unique indexes in non-xact-snapshot isolation modes. 
*/ if (!isconcurrent) { @@ -1449,6 +1462,11 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); @@ -1509,7 +1527,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1536,7 +1554,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->brokenhotchain = false; table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1612,6 +1631,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* Save leader state now that it's clear build will be parallel */ buildstate->btleader = btleader; + /* + * In the case of concurrent build, snapshots are going to be reset periodically. + * Wait until all workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _bt_leader_participate_as_worker(buildstate); @@ -1620,9 +1646,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1644,7 +1674,7 @@ _bt_end_parallel(BTLeader *btleader) InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(btleader->snapshot)) + if (btleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(btleader->snapshot)) UnregisterSnapshot(btleader->snapshot); DestroyParallelContext(btleader->pcxt); ExitParallelMode(); @@ -1894,6 +1924,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, SortCoordinate coordinate; BTBuildState buildstate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -1948,12 +1979,17 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, /* Join parallel scan */ indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; + pscan = ParallelTableScanFromBTShared(btshared); scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared), + pscan, SO_NONE); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* Execute this worker's part of the sort */ if (progress) @@ -1989,4 +2025,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, tuplesort_end(btspool->sortstate); if (btspool2) tuplesort_end(btspool2->sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } diff --git 
a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 68ff0966f1c..cf337eda5f6 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -132,10 +132,10 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) { Size sz = 0; - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) sz = add_size(sz, EstimateSnapshotSpace(snapshot)); else - Assert(snapshot == SnapshotAny); + Assert(snapshot == SnapshotAny || snapshot == InvalidSnapshot); sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); @@ -144,21 +144,36 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot) + Snapshot snapshot, bool reset_snapshot) { Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); pscan->phs_snapshot_off = snapshot_off; - if (IsMVCCSnapshot(snapshot)) + /* + * Initialize parallel scan description. For normal scans with a regular + * MVCC snapshot, serialize the snapshot info. For scans that use periodic + * snapshot resets, mark the scan accordingly. 
+ */ + if (reset_snapshot) + { + Assert(snapshot == InvalidSnapshot); + pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = true; + INJECTION_POINT("table_parallelscan_initialize", NULL); + } + else if (IsMVCCSnapshot(snapshot)) { SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = false; } else { Assert(snapshot == SnapshotAny); + Assert(!reset_snapshot); pscan->phs_snapshot_any = true; + pscan->phs_reset_snapshot = false; } } @@ -172,7 +187,19 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan, Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); - if (!pscan->phs_snapshot_any) + /* + * For scans that + * use periodic snapshot resets, mark the scan accordingly and use the active + * snapshot as the initial state. + */ + if (pscan->phs_reset_snapshot) + { + Assert(ActiveSnapshotSet()); + internal_flags |= SO_RESET_SNAPSHOT; + /* Start with current active snapshot. */ + snapshot = GetActiveSnapshot(); + } + else if (!pscan->phs_snapshot_any) { /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 89e9d224eec..6f50f5aff9c 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -79,6 +79,7 @@ #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) #define PARALLEL_KEY_CLIENTCONNINFO UINT64CONST(0xFFFFFFFFFFFF000F) +#define PARALLEL_KEY_SNAPSHOT_RESTORED UINT64CONST(0xFFFFFFFFFFFF0010) /* Fixed-size parallel state. 
*/ typedef struct FixedParallelState @@ -308,6 +309,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(bool), + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + /* Estimate how much we'll need for the entrypoint info. */ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + strlen(pcxt->function_name) + 2); @@ -379,6 +384,7 @@ InitializeParallelDSM(ParallelContext *pcxt) char *entrypointstate; char *uncommittedenumsspace; char *clientconninfospace; + bool *snapshot_set_flag_space; Size lnamelen; /* Serialize shared libraries we have loaded. */ @@ -494,6 +500,19 @@ InitializeParallelDSM(ParallelContext *pcxt) strcpy(entrypointstate, pcxt->library_name); strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + + /* + * Establish dynamic shared memory to pass information about importing + * of snapshot. + */ + snapshot_set_flag_space = + shm_toc_allocate(pcxt->toc, mul_size(sizeof(bool), pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + pcxt->worker[i].snapshot_restored = &snapshot_set_flag_space[i]; + *pcxt->worker[i].snapshot_restored = false; + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, snapshot_set_flag_space); } /* Update nworkers_to_launch, in case we changed nworkers above. */ @@ -670,6 +689,10 @@ LaunchParallelWorkers(ParallelContext *pcxt) * Wait for all workers to attach to their error queues, and throw an error if * any worker fails to do this. * + * wait_for_snapshot: track whether each parallel worker has successfully restored + * its snapshot. This is needed when using periodic snapshot resets to ensure all + * workers have a valid initial snapshot before proceeding with the scan. 
+ * * Callers can assume that if this function returns successfully, then the * number of workers given by pcxt->nworkers_launched have initialized and * attached to their error queues. Whether or not these workers are guaranteed @@ -699,7 +722,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) * call this function at all. */ void -WaitForParallelWorkersToAttach(ParallelContext *pcxt) +WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot) { int i; @@ -743,9 +766,24 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); if (shm_mq_get_sender(mq) != NULL) { - /* Yes, so it is known to be attached. */ - pcxt->known_attached_workers[i] = true; - ++pcxt->nknown_attached_workers; + pg_read_barrier(); + if (!wait_for_snapshot || *pcxt->worker[i].snapshot_restored) + { + /* Worker is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + else + { + /* + * Worker attached but hasn't restored its snapshot yet. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10, WAIT_EVENT_BGWORKER_STARTUP); + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } } } else if (status == BGWH_STOPPED) @@ -788,6 +826,16 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) break; } } + + /* Set snapshot restored flag to false. 
*/ + if (pcxt->nworkers > 0) + { + bool *snapshot_restored_space; + snapshot_restored_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + for (i = 0; i < pcxt->nworkers; ++i) + snapshot_restored_space[i] = false; + } } /* @@ -1304,6 +1352,7 @@ ParallelWorkerMain(Datum main_arg) shm_toc *toc; FixedParallelState *fps; char *error_queue_space; + bool *snapshot_restored_space; shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; @@ -1507,6 +1556,11 @@ ParallelWorkerMain(Datum main_arg) fps->parallel_leader_pgproc); PushActiveSnapshot(asnapshot); + /* Snapshot is restored, set flag to make leader know about it. */ + snapshot_restored_space = shm_toc_lookup(toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + snapshot_restored_space[ParallelWorkerNumber] = true; + pg_write_barrier(); + /* * We've changed which tuples we can see, and must therefore invalidate * system caches. diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 04803b0e37d..3b090fd2c30 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -374,7 +374,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index 4a8fe91b2b3..74e1ca89b7b 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -458,7 +458,8 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, 
node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 2e6197f5f35..df9116532c0 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -353,14 +353,6 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { - /* - * We might be able to relax this, but nothing that could otherwise work - * needs it. - */ - if (IsInParallelMode()) - elog(ERROR, - "cannot update SecondarySnapshot during a parallel operation"); - /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if required. diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 60f857675e0..9a3e14e0c0e 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -28,6 +28,7 @@ typedef struct ParallelWorkerInfo { BackgroundWorkerHandle *bgwhandle; shm_mq_handle *error_mqh; + bool *snapshot_restored; } ParallelWorkerInfo; typedef struct ParallelContext @@ -67,7 +68,7 @@ extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); extern void LaunchParallelWorkers(ParallelContext *pcxt); -extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot); extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); extern void DestroyParallelContext(ParallelContext *pcxt); extern bool ParallelContextActive(void); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 960abf6c214..4249bb45755 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -81,6 +81,7 @@ typedef struct ParallelTableScanDescData 
RelFileLocator phs_locator; /* physical relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + bool phs_reset_snapshot; /* use SO_RESET_SNAPSHOT? */ Size phs_snapshot_off; /* data for snapshot */ } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 941af05ad3e..02c31caed2b 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1207,7 +1207,8 @@ extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); */ extern void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot); + Snapshot snapshot, + bool reset_snapshot); /* * Begin a parallel scan. `pscan` needs to have been initialized with @@ -1852,7 +1853,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique index and non-parallel concurrent build, + * In case of non-unique concurrent build, * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. * That leads to changing snapshots on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 3d93232361f..60f8356ed82 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -35,13 +35,11 @@ typedef struct AttrMap AttrMap; * - the index is non-unique (unique needs consistent snapshot) * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a * registered transaction snapshot) - * - the build is not parallel (parallel needs separate infrastructure) */ #define IndexBuildResetsSnapshots(indexInfo) \ ((indexInfo)->ii_Concurrent && \ !(indexInfo)->ii_Unique && \ - !IsolationUsesXactSnapshot() && \ - !(indexInfo)->ii_ParallelWorkers) + !IsolationUsesXactSnapshot()) /* Action code for index_set_state_flags */ typedef enum diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index ed4aaaf3463..e5a8a7f3a79 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -17,6 +17,12 @@ SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice' (1 row) +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -72,30 +78,45 @@ NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + injection_points_detach +------------------------- + +(1 row) + CREATE UNIQUE INDEX CONCURRENTLY idx ON 
cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); 
+NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; BEGIN TRANSACTION; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 553787539b3..a85b0cbf575 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -3,7 +3,7 @@ CREATE EXTENSION injection_points; SELECT injection_points_set_local(); SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); - +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); @@ -53,6 +53,9 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- 2.43.0 [application/octet-stream] v5-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch (52.5K, 5-v5-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch) download | inline diff: From 18994668032858122cd599429591e26d406146c2 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Fri, 3 Apr 2026 18:23:01 +0200 Subject: [PATCH v5 2/4] Reset snapshots periodically in non-unique non-parallel concurrent index builds Long-living snapshots used by 
CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY can hold back the global xmin horizon. Commit d9d0762 attempted to allow VACUUM to ignore such snapshots to mitigate this problem. However, this was reverted in commit e28bb88 because it could cause indexes to miss heap tuples that were HOT-updated and HOT-pruned during index creation, leading to index corruption. This patch introduces an alternative: periodically resetting the snapshot used during the first phase. Resetting the snapshot every N pages during the heap scan allows the xmin horizon to advance. Currently, this technique is applied only to: - the first scan of the heap: The second scan during index validation still uses a single snapshot to ensure index correctness - non-parallel index builds: Parallel index builds are not yet supported and will be addressed in following commits - non-unique indexes: Unique index builds still require a consistent snapshot to enforce uniqueness constraints; this will be addressed in following commits A new scan option SO_RESET_SNAPSHOT is introduced. When set, it causes the snapshot to be reset "between" every SO_RESET_SNAPSHOT_EACH_N_PAGE pages during the scan. The heap scan code is adjusted to support this option, and the index build code is modified to use it for applicable concurrent index builds that are not on system catalogs and not using parallel workers.
--- contrib/amcheck/verify_nbtree.c | 3 +- contrib/pgstattuple/pgstattuple.c | 2 +- src/backend/access/brin/brin.c | 19 ++- src/backend/access/gin/gininsert.c | 22 ++++ src/backend/access/gist/gistbuild.c | 5 + src/backend/access/hash/hash.c | 4 + src/backend/access/heap/heapam.c | 50 +++++++- src/backend/access/heap/heapam_handler.c | 74 +++++++++-- src/backend/access/index/genam.c | 2 +- src/backend/access/nbtree/nbtsort.c | 27 +++- src/backend/access/spgist/spginsert.c | 4 + src/backend/catalog/index.c | 30 ++++- src/backend/commands/indexcmds.c | 14 +-- src/backend/optimizer/plan/planner.c | 10 ++ src/backend/utils/errcodes.txt | 1 + src/backend/utils/misc/guc_parameters.dat | 10 ++ src/include/access/tableam.h | 31 ++++- src/include/catalog/index.h | 17 +++ src/include/miscadmin.h | 1 + src/test/modules/injection_points/Makefile | 2 +- .../expected/cic_reset_snapshots.out | 118 ++++++++++++++++++ src/test/modules/injection_points/meson.build | 1 + .../sql/cic_reset_snapshots.sql | 101 +++++++++++++++ 23 files changed, 509 insertions(+), 39 deletions(-) create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b74ab5f7a05..40874167631 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -557,7 +557,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK? */ + true, /* syncscan OK? 
*/ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 6a7f8cb4a7c..a0e7ed2e137 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -335,7 +335,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index bdb30752e09..09741186264 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1221,11 +1221,12 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) state->bs_sortstate = tuplesort_begin_index_brin(maintenance_work_mem, coordinate, TUPLESORT_NONE); - + InvalidateCatalogSnapshot(); /* scan the relation and merge per-worker results */ reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1238,6 +1239,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = table_index_build_scan(heap, index, indexInfo, false, true, brinbuildCallback, state, NULL); + InvalidateCatalogSnapshot(); /* * process the final batch * @@ -1257,6 +1259,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -2390,6 +2393,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, 
WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2415,9 +2419,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. @@ -2460,6 +2471,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -2539,6 +2552,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _brin_end_parallel(brinleader, NULL); return; } @@ -2555,6 +2570,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 9d83a495775..5b035aa5dec 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -31,6 +31,7 @@ #include "storage/condition_variable.h" #include "storage/proc.h" #include "storage/predicate.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/datum.h" #include "utils/memutils.h" @@ -683,6 +684,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + /* Report table scan phase started */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN); @@ -745,11 +749,13 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) tuplesort_begin_index_gin(heap, index, maintenance_work_mem, coordinate, TUPLESORT_NONE); + InvalidateCatalogSnapshot(); /* scan the relation in parallel and merge per-worker results */ reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -759,6 +765,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ reltuples = table_index_build_scan(heap, index, indexInfo, false, true, ginBuildCallback, &buildstate, NULL); + InvalidateCatalogSnapshot(); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); @@ -772,6 +779,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } 
MemoryContextDelete(buildstate.funcCtx); @@ -948,6 +956,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -972,9 +981,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. @@ -1017,6 +1033,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1091,6 +1109,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _gin_end_parallel(ginleader, NULL); return; } @@ -1107,6 +1127,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 7f57c787f4c..86af1e11317 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -38,11 +38,13 @@ #include "access/gist_private.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -259,6 +261,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.indtuples = 0; buildstate.indtuplesSize = 0; + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); if (buildstate.buildMode == GIST_SORTED_BUILD) { /* @@ -350,6 +353,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 8d8cd30dc38..44a374c0d57 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -24,12 +24,14 @@ #include "access/stratnum.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "commands/progress.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "optimizer/plancat.h" #include "pgstat.h" +#include "storage/proc.h" #include "storage/read_stream.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" @@ -210,6 +212,8 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; 
result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e06ce2db2cf..0394e0023cd 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -55,6 +55,9 @@ #include "utils/spccache.h" #include "utils/syscache.h" +/* GUCs */ +int concurrent_index_reset_snapshot_every_n_pages = 4096; + static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, uint32 options); @@ -698,6 +701,36 @@ heap_prepare_pagescan(TableScanDesc sscan) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } +/* + * Reset the active snapshot during a scan. + * This ensures the xmin horizon can advance while maintaining safe tuple visibility. + * Note: No other snapshot should be active during this operation. + */ +static inline void +heap_reset_scan_snapshot(TableScanDesc sscan) +{ + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure active snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + PopActiveSnapshot(); + + sscan->rs_snapshot = InvalidSnapshot; /* just to be tidy */ + Assert(!HaveRegisteredOrActiveSnapshot()); + InvalidateCatalogSnapshot(); + + /* The goal of snapshot reset is to allow horizon to advance. */ + Assert(!TransactionIdIsValid(MyProc->xmin)); +#if USE_INJECTION_POINTS + /* In some cases it is still not possible due to xid assignment. */ + if (!TransactionIdIsValid(MyProc->xid)) + INJECTION_POINT("heap_reset_scan_snapshot_effective", NULL); +#endif + + PushActiveSnapshot(GetLatestSnapshot()); + sscan->rs_snapshot = GetActiveSnapshot(); +} + /* * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM. 
* @@ -739,7 +772,13 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); if (BufferIsValid(scan->rs_cbuf)) + { scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT && + concurrent_index_reset_snapshot_every_n_pages > 0 && + scan->rs_cblock % concurrent_index_reset_snapshot_every_n_pages == 0) + heap_reset_scan_snapshot((TableScanDesc) scan); + } } /* @@ -1410,7 +1449,16 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_parallelworkerdata != NULL) pfree(scan->rs_parallelworkerdata); - +#ifdef USE_ASSERT_CHECKING + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) + { + Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)); + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + } +#endif if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 07f07188d46..754f5a720df 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1114,8 +1114,17 @@ heapam_index_build_range_scan(Relation heapRelation, TupleTableSlot *slot; EState *estate; ExprContext *econtext; - Snapshot snapshot; + /* + * In isolation modes where IsolationUsesXactSnapshot() is true, the + * registered scan snapshot can differ from the active snapshot copy + * pushed for expression evaluation, so remember the registered one + * separately for later UnregisterSnapshot(). 
+ */ + Snapshot snapshot, + registered_snapshot = InvalidSnapshot; bool need_unregister_snapshot = false; + bool need_pop_active_snapshot = false; + bool reset_snapshots = false; TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; @@ -1150,9 +1159,6 @@ heapam_index_build_range_scan(Relation heapRelation, /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time @@ -1162,6 +1168,16 @@ heapam_index_build_range_scan(Relation heapRelation, */ OldestXmin = InvalidTransactionId; + /* + * For unique indexes we need a consistent snapshot for the whole scan. + * Resetting snapshots also doesn't work in xact-snapshot isolation modes, + * because those keep a registered transaction snapshot for the whole xact. + * In the case of parallel scan, some additional infrastructure is required + * to perform a scan with SO_RESET_SNAPSHOT which is not yet ready. + */ + reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && + !is_system_catalog; /* just for the case */ + /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); @@ -1170,24 +1186,42 @@ heapam_index_build_range_scan(Relation heapRelation, { /* * Serial index build. - * - * Must begin our own heap scan in this case. We may also need to - * register a snapshot whose lifetime is under our direct control. */ if (!TransactionIdIsValid(OldestXmin)) { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - need_unregister_snapshot = true; + snapshot = GetTransactionSnapshot(); + /* + * Must begin our own heap scan in this case. 
We may also need to + * register a snapshot whose lifetime is under our direct control. + * In case of resetting of a snapshot during the scan, registration is + * not allowed because snapshot is going to be changed every so + * often. + */ + if (!reset_snapshots) + { + /* Store active snapshot because PushActiveSnapshot() may copy */ + snapshot = registered_snapshot = RegisterSnapshot(snapshot); + need_unregister_snapshot = true; + } + Assert(!ActiveSnapshotSet()); + PushActiveSnapshot(snapshot); + /* table_beginscan_strat() needs the exact active snapshot pointer */ + snapshot = GetActiveSnapshot(); + need_pop_active_snapshot = true; } else + { + Assert(!indexInfo->ii_Concurrent); snapshot = SnapshotAny; + } scan = table_beginscan_strat(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? */ + allow_sync, /* syncscan OK? */ + reset_snapshots /* reset snapshots? */); } else { @@ -1201,6 +1235,12 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; + if (IsMVCCSnapshot(snapshot)) + { + /* Don't expose SnapshotAny to SQL run by predicates/expressions. */ + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; + } } hscan = (HeapScanDesc) scan; @@ -1215,6 +1255,13 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. 
*/ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1650,9 +1697,11 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) - UnregisterSnapshot(snapshot); + UnregisterSnapshot(registered_snapshot); ExecDropSingleTupleTableSlot(slot); @@ -1722,7 +1771,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 1408989c568..eef12741249 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -473,7 +473,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 756dfa3dcf4..8d804d6bcfe 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -262,7 +262,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -325,18 
+325,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. */ - _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2, IndexBuildResetsSnapshots(indexInfo)); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -484,6 +486,8 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, else reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -539,7 +543,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) { BTWriteState wstate; @@ -561,18 +565,21 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + InvalidateCatalogSnapshot(); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1411,6 +1418,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,9 +1444,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. 
*/ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1492,6 +1507,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1586,6 +1603,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1602,6 +1621,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 780ef646a54..ff457e3bbfa 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -20,10 +20,12 @@ #include "access/spgist_private.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -143,6 +145,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) result = palloc0_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 9407c357f27..ce017e836be 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -81,6 +81,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; @@ -1510,8 +1511,8 @@ index_concurrently_build(Oid heapRelationId, Relation indexRelation; IndexInfo *indexInfo; - /* This had better make sure that a snapshot is active */ - Assert(ActiveSnapshotSet()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xid)); /* Open and lock the parent heap relation */ heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock); @@ -1529,19 +1530,28 @@ index_concurrently_build(Oid heapRelationId, indexRelation = index_open(indexRelationId, RowExclusiveLock); + /* BuildIndexInfo may require a snapshot for 
expressions and predicates */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * We have to re-build the IndexInfo struct, since it was lost in the * commit of the transaction where this concurrent index was created at * the catalog level. */ indexInfo = BuildIndexInfo(indexRelation); + /* Done with snapshot */ + PopActiveSnapshot(); Assert(!indexInfo->ii_ReadyForInserts); indexInfo->ii_Concurrent = true; indexInfo->ii_BrokenHotChain = false; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* Now build the index */ index_build(heapRel, indexRelation, indexInfo, false, true, true); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -1552,12 +1562,19 @@ index_concurrently_build(Oid heapRelationId, table_close(heapRel, NoLock); index_close(indexRelation, NoLock); + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates. */ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); + /* we can do away with our snapshot */ + PopActiveSnapshot(); } /* @@ -3257,7 +3274,8 @@ IndexCheckExclusion(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK */ + true, /* syncscan OK */ + false); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3320,12 +3338,16 @@ IndexCheckExclusion(Relation heapRelation, * as of the start of the scan (see table_index_build_scan), whereas a normal * build takes care to include recently-dead tuples. 
This is OK because * we won't mark the index valid until all transactions that might be able - * to see those tuples are gone. The reason for doing that is to avoid + * to see those tuples are gone. One of the reasons for doing that is to avoid * bogus unique-index failures due to concurrent UPDATEs (we might see * different versions of the same row as being valid when we pass over them, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * + * Furthermore, in the case of a non-unique index we set SO_RESET_SNAPSHOT for the + * scan, which causes a new snapshot to be set as active every so often. The reason + * for that is to allow the xmin horizon to advance. + * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all * transactions that could have been modifying the table to terminate. Now diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 9ab74c8df0a..be3dc5e8d28 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1704,23 +1704,17 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We now take a new snapshot, and build the index using all tuples that - * are visible in this snapshot. We can be sure that any HOT updates to + * We build the index using all tuples that are visible in one or more + * periodically refreshed snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its * HOT-chain or the extension of the chain is HOT-safe for this index. 
*/ - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* Perform concurrent build of index */ index_concurrently_build(tableId, indexRelationId); - /* we can do away with our snapshot */ - PopActiveSnapshot(); - /* * Commit this transaction to make the indisready update visible. */ @@ -4137,9 +4131,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* * Update progress for the index to build, with the correct parent * table involved. @@ -4154,7 +4145,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Perform concurrent build of new index */ index_concurrently_build(newidx->tableId, newidx->indexId); - PopActiveSnapshot(); CommitTransactionCommand(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4ec76ce31a9..6a086e23a6c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,6 +63,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/snapmgr.h" /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -7047,6 +7048,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) Relation heap; Relation index; RelOptInfo *rel; + bool need_pop_active_snapshot = false; int parallel_workers; BlockNumber heap_blocks; double reltuples; @@ -7102,6 +7104,12 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) heap = table_open(tableOid, NoLock); index = index_open(indexOid, NoLock); + /* Set ActiveSnapshot since functions in the indexes may need it */ + if (!ActiveSnapshotSet()) + { + PushActiveSnapshot(GetTransactionSnapshot()); + need_pop_active_snapshot = true; + } /* * Determine if it's safe to proceed. 
* @@ -7159,6 +7167,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) parallel_workers--; done: + if (need_pop_active_snapshot) + PopActiveSnapshot(); index_close(index, NoLock); table_close(heap, NoLock); diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 5b25402ebbe..a80f08af241 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -259,6 +259,7 @@ Section: Class 25 - Invalid Transaction State 25P02 E ERRCODE_IN_FAILED_SQL_TRANSACTION in_failed_sql_transaction 25P03 E ERRCODE_IDLE_IN_TRANSACTION_SESSION_TIMEOUT idle_in_transaction_session_timeout 25P04 E ERRCODE_TRANSACTION_TIMEOUT transaction_timeout +25P05 E ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND inappropriate_isolation_level_for_command Section: Class 26 - Invalid SQL Statement Name diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 7a8a5d0764c..47cb33378bc 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -509,6 +509,16 @@ options => 'compute_query_id_options', }, +{ name => 'concurrent_index_reset_snapshot_every_n_pages', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets how often concurrent index builds refresh their snapshot.', + long_desc => 'Zero disables periodic snapshot refresh during the first heap scan of eligible CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY builds.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'concurrent_index_reset_snapshot_every_n_pages', + boot_val => '4096', + min => '0', + max => 'INT_MAX', +}, + { name => 'config_file', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', short_desc => 'Sets the server\'s main configuration file.', flags => 'GUC_DISALLOW_IN_FILE | GUC_SUPERUSER_ONLY', diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 4647785fd35..941af05ad3e 100644 --- 
a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -24,6 +24,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/injection_point.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -69,6 +70,18 @@ typedef enum ScanOptions /* set if the query doesn't modify the relation */ SO_HINT_REL_READ_ONLY = 1 << 10, + /* + * Reset scan and catalog snapshot every so often? If so, the + * concurrent_index_reset_snapshot_every_n_pages GUC decides when the + * active snapshot is popped, the catalog snapshot invalidated, and the + * latest snapshot pushed as active. + * + * At the end of the scan the snapshot is not popped. + * The goal of this mode is to keep the xmin horizon propagating forward. + * + * See heap_reset_scan_snapshot for details. + */ + SO_RESET_SNAPSHOT = 1 << 11, } ScanOptions; /* @@ -82,7 +95,7 @@ typedef enum ScanOptions (SO_TYPE_SEQSCAN | SO_TYPE_BITMAPSCAN | SO_TYPE_SAMPLESCAN | \ SO_TYPE_TIDSCAN | SO_TYPE_TIDRANGESCAN | SO_TYPE_ANALYZE | \ SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | \ - SO_TEMP_SNAPSHOT) + SO_TEMP_SNAPSHOT | SO_RESET_SNAPSHOT) /* * Result codes for table_{update,delete,lock_tuple}, and for visibility @@ -962,7 +975,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, - bool allow_strat, bool allow_sync) + bool allow_strat, bool allow_sync, + bool reset_snapshot) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; @@ -970,6 +984,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, flags |= SO_ALLOW_STRAT; if (allow_sync) flags |= SO_ALLOW_SYNC; + if (reset_snapshot) + { + INJECTION_POINT("table_beginscan_strat_reset_snapshots", NULL); + /* Active snapshot is required on start. */ + Assert(GetActiveSnapshot() == snapshot); + /* Active snapshot should not be registered to keep xmin propagating. 
*/ + Assert(GetActiveSnapshot()->regd_count == 0); + flags |= (SO_RESET_SNAPSHOT); + } return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags, SO_NONE); @@ -1828,6 +1851,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * very hard to detect whether they're really incompatible with the chain tip. * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. + * + * In the case of a non-unique index and a non-parallel concurrent build, + * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. + * That leads to changing snapshots on the fly to allow the xmin horizon to propagate. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 9aee8226347..3d93232361f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -14,6 +14,7 @@ #ifndef INDEX_H #define INDEX_H +#include "access/xact.h" #include "catalog/objectaddress.h" #include "nodes/execnodes.h" @@ -26,6 +27,22 @@ typedef struct AttrMap AttrMap; #define DEFAULT_INDEX_TYPE "btree" +/* + * Does this concurrent index build use periodic snapshot resets? 
+ * + * Snapshot resetting is only applicable when all of: + * - the build is concurrent (ii_Concurrent) + * - the index is non-unique (unique needs consistent snapshot) + * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a + * registered transaction snapshot) + * - the build is not parallel (parallel needs separate infrastructure) + */ +#define IndexBuildResetsSnapshots(indexInfo) \ + ((indexInfo)->ii_Concurrent && \ + !(indexInfo)->ii_Unique && \ + !IsolationUsesXactSnapshot() && \ + !(indexInfo)->ii_ParallelWorkers) + /* Action code for index_set_state_flags */ typedef enum { diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 7277c37e779..b6c9fc64a8d 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -269,6 +269,7 @@ extern PGDLLIMPORT int work_mem; extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; +extern PGDLLIMPORT int concurrent_index_reset_snapshot_every_n_pages; /* * Upper and lower hard limits for the buffer access strategy ring size diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index a41d781f8c9..eeaeaaf2cc6 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -9,7 +9,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc vacuum +REGRESS = injection_points hashagg reindex_conc vacuum cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic \ diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 00000000000..ed4aaaf3463 --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 
+1,118 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; 
+NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE 
INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY cannot run inside a transaction block +ROLLBACK ; +SET default_transaction_isolation = 'repeatable read'; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function 
cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index fcc85414515..5d298cd2710 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -36,6 +36,7 @@ tests += { 'hashagg', 'reindex_conc', 'vacuum', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 00000000000..553787539b3 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,101 @@ +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON 
cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY 
cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ROLLBACK ; + +SET default_transaction_isolation = 'repeatable read'; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; -- 2.43.0 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY 2026-03-09 00:03 Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-03-20 01:15 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-03-21 23:50 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-04-06 17:55 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> @ 2026-04-11 17:48 ` Mihail Nikalayeu <[email protected]> 2026-04-18 13:33 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 0 siblings, 1 reply; 6+ messages in thread From: Mihail Nikalayeu @ 2026-04-11 17:48 UTC (permalink / raw) To: Álvaro Herrera <[email protected]>; +Cc: PostgreSQL Hackers <[email protected]>; Matthias van de Meent <[email protected]>; Antonin Houska <[email protected]>; Sergey Sargsyan <[email protected]>; Hannu Krosing <[email protected]> Rebased + 1) extract duplicate key error reporting into _bt_report_duplicate() 2) limit heap fetches in the tuplesort fail-fast duplicate check to a configurable percentage Best regards, Mikhail. 
Attachments: [application/octet-stream] v5-0001-Add-stress-tests-for-concurrent-index-builds.patch (12.6K, 2-v5-0001-Add-stress-tests-for-concurrent-index-builds.patch) download | inline diff: From 3a2a9987d33cca714d46c83d146c7f543c604f8f Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 30 Nov 2024 16:24:20 +0100 Subject: [PATCH v5 1/4] Add stress tests for concurrent index builds Introduce stress tests for concurrent index operations: - test concurrent inserts/updates during CREATE/REINDEX INDEX CONCURRENTLY - cover various index types (btree, gin, gist, brin, hash, spgist) - test unique and non-unique indexes - test with expressions and predicates - test both parallel and non-parallel operations - test both read-committed and repeatable-read isolation levels These tests verify the behavior of the following commits. --- src/bin/pg_amcheck/meson.build | 1 + src/bin/pg_amcheck/t/006_cic.pl | 293 ++++++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+) create mode 100644 src/bin/pg_amcheck/t/006_cic.pl diff --git a/src/bin/pg_amcheck/meson.build b/src/bin/pg_amcheck/meson.build index 592cef74ecb..51a62dccb7b 100644 --- a/src/bin/pg_amcheck/meson.build +++ b/src/bin/pg_amcheck/meson.build @@ -28,6 +28,7 @@ tests += { 't/003_check.pl', 't/004_verify_heapam.pl', 't/005_opclass_damage.pl', + 't/006_cic.pl', ], }, } diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl new file mode 100644 index 00000000000..dd7a1eff0ef --- /dev/null +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -0,0 +1,293 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test REINDEX CONCURRENTLY with concurrent modifications and HOT updates +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use constant STRESS_PGBENCH_CLIENTS => 30; +use constant STRESS_PGBENCH_JOBS => 8; +use constant STRESS_PGBENCH_TRANSACTIONS => 10000; +use constant 
STRESS_MAX_SLEEP_MS => 10; + +use constant DEFAULT_PGBENCH_CLIENTS => 15; +use constant DEFAULT_PGBENCH_JOBS => 4; +use constant DEFAULT_PGBENCH_TRANSACTIONS => 500; +use constant DEFAULT_MAX_SLEEP_MS => 1; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my $node; +my $pg_test_extra = $ENV{PG_TEST_EXTRA} // ''; +my $is_stress = $pg_test_extra =~ /\bstress\b/ ? 1 : 0; +my $pgbench_clients = + $is_stress ? STRESS_PGBENCH_CLIENTS : DEFAULT_PGBENCH_CLIENTS; +my $pgbench_jobs = $is_stress ? STRESS_PGBENCH_JOBS : DEFAULT_PGBENCH_JOBS; +my $pgbench_transactions = + $is_stress ? STRESS_PGBENCH_TRANSACTIONS : DEFAULT_PGBENCH_TRANSACTIONS; +my $max_sleep_ms = $is_stress ? STRESS_MAX_SLEEP_MS : DEFAULT_MAX_SLEEP_MS; +my $pgbench_options = sprintf( + '--no-vacuum --client=%d --jobs=%d --exit-on-abort --transactions=%d', + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions); +my $no_hot = $is_stress ? int(rand(2)) : 0; + +print( + sprintf( + 'settings: PG_TEST_EXTRA=%s stress=%d clients=%d jobs=%d transactions=%d max_sleep_ms=%d no_hot=%d', + defined($ENV{PG_TEST_EXTRA}) + ? ($pg_test_extra eq '' ? '(empty)' : $pg_test_extra) + : '(undef)', + $is_stress, + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions, + $max_sleep_ms, + $no_hot)); +print "\n"; + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('RC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . 
(1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->append_conf('postgresql.conf', 'fsync = off'); +$node->append_conf('postgresql.conf', 'maintenance_work_mem = 32MB'); # to avoid OOM +$node->append_conf('postgresql.conf', 'shared_buffers = 32MB'); # to avoid OOM +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE UNLOGGED TABLE tbl(i int primary key, + c1 money default 0, c2 money default 0, + c3 money default 0, updated_at timestamp, + ia int4[], p point))); + +if ($no_hot) { $node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i, updated_at);)); } + +# create sequence +$node->safe_psql('postgres', q(CREATE UNLOGGED SEQUENCE in_row_rebuild START 1 INCREMENT 1;)); +$node->safe_psql('postgres', q(SELECT nextval('in_row_rebuild');)); + +# Create helper functions for predicate tests +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_stable() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN true; + END; $$; +)); + +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_const(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + RETURN MOD($1, 2) = 0; + END; $$; +)); + +# Run CIC/RIC in different options concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY', + { + 'concurrent_ops' => sprintf(q( + SET debug_parallel_query = off; -- this is because predicate_stable implementation + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set variant random(0, 5) + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + \if :variant = 0 + CREATE INDEX 
CONCURRENTLY new_idx ON tbl(i, updated_at); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_stable(); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE MOD(i, 2) = 0; + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_const(i); + \elif :variant = 4 + CREATE INDEX CONCURRENTLY new_idx ON tbl(predicate_const(i)); + \elif :variant = 5 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, predicate_const(i), updated_at) WHERE predicate_const(i); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1000, 100000) + BEGIN; + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl 
VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + COMMIT; + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for unique index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for unique BTREE', + { + 'concurrent_ops_unique_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + CREATE UNIQUE INDEX CONCURRENTLY new_idx ON tbl(i); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIN with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually 
processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN', + { + 'concurrent_ops_gin_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIN (ia); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIST/BRIN/HASH/SPGIST index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_other_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + \set variant random(0, 3) + \if :variant 
= 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIST (p); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING BRIN (updated_at); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING HASH (updated_at); + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING SPGIST (p); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->stop; +done_testing(); -- 2.43.0 [application/octet-stream] v5-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch (38.9K, 3-v5-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch) download | inline diff: From 7f88972af5bddcdc77d7b976a0ca94be2f9e37ae Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Fri, 3 Apr 2026 19:56:44 +0200 Subject: [PATCH v5 3/4] Support snapshot resets in parallel concurrent index builds Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. The main limitation of applying that technique to parallel builds was a requirement to wait until worker processes restore their initial snapshot from leader. 
To address this, the following changes are applied: - add infrastructure to track snapshot restoration in parallel workers - extend parallel scan initialization to support periodic snapshot resets - wait for parallel workers to restore their initial snapshots before proceeding with scan - relax limitation for parallel worker to call GetLatestSnapshot --- src/backend/access/brin/brin.c | 54 ++++++++++++---- src/backend/access/gin/gininsert.c | 51 +++++++++++---- src/backend/access/heap/heapam_handler.c | 7 +-- src/backend/access/nbtree/nbtsort.c | 51 +++++++++++++-- src/backend/access/table/tableam.c | 37 +++++++++-- src/backend/access/transam/parallel.c | 62 +++++++++++++++++-- src/backend/executor/nodeSeqscan.c | 3 +- src/backend/executor/nodeTidrangescan.c | 3 +- src/backend/utils/time/snapmgr.c | 8 --- src/include/access/parallel.h | 3 +- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 5 +- src/include/catalog/index.h | 4 +- .../expected/cic_reset_snapshots.out | 25 +++++++- .../sql/cic_reset_snapshots.sql | 5 +- 15 files changed, 259 insertions(+), 60 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 09741186264..ff57fe61add 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1275,6 +1275,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = idxtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + return result; } @@ -2394,6 +2397,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2411,12 +2415,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, scantuplesortstates = leaderparticipates ? 
request + 1 : request; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot may be reset periodically (in non-xact-snapshot isolation + * modes) to allow the xmin horizon to advance. */ if (!isconcurrent) { @@ -2424,10 +2432,15 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); } /* @@ -2473,7 +2486,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -2499,7 +2512,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromBrinShared(brinshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. 
@@ -2561,6 +2575,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = brinleader; + /* + * In case of concurrent build with snapshot resets, wait until all + * workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _brin_leader_participate_as_worker(buildstate, heap, index); @@ -2569,9 +2590,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -2592,8 +2617,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(brinleader->snapshot)) + if (brinleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(brinleader->snapshot)) UnregisterSnapshot(brinleader->snapshot); DestroyParallelContext(brinleader->pcxt); ExitParallelMode(); @@ -2794,7 +2818,7 @@ _brin_parallel_merge(BrinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * brin index build based on the snapshot its parallel scan will use. + * brin index build. 
*/ static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot) @@ -2843,6 +2867,7 @@ _brin_parallel_scan_and_build(BrinBuildState *state, { SortCoordinate coordinate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -2859,13 +2884,18 @@ _brin_parallel_scan_and_build(BrinBuildState *state, /* Join parallel scan */ indexInfo = BuildIndexInfo(index); indexInfo->ii_Concurrent = brinshared->isconcurrent; + pscan = ParallelTableScanFromBrinShared(brinshared); scan = table_beginscan_parallel(heap, - ParallelTableScanFromBrinShared(brinshared), + pscan, SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, true, brinbuildCallbackParallel, state, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* insert the last item */ form_and_spill_tuple(state); @@ -2888,6 +2918,9 @@ _brin_parallel_scan_and_build(BrinBuildState *state, ConditionVariableSignal(&brinshared->workersdonecv); tuplesort_end(state->bs_sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } /* @@ -2964,7 +2997,6 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort, heapRel, indexRel, sortmem, false); - /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 5b035aa5dec..3251db0e617 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -809,6 +809,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = 
buildstate.indtuples; + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } @@ -957,6 +958,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -973,12 +975,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, scantuplesortstates = leaderparticipates ? request + 1 : request; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot may be reset periodically (in non-xact-snapshot isolation + * modes) to allow the xmin horizon to advance. 
*/ if (!isconcurrent) { @@ -986,10 +992,15 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); } /* @@ -1035,7 +1046,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1060,7 +1071,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromGinBuildShared(ginshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1118,6 +1130,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = ginleader; + /* + * In case of concurrent build with snapshot resets, wait until all + * workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _gin_leader_participate_as_worker(buildstate, heap, index); @@ -1126,9 +1145,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1149,8 +1171,7 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(ginleader->snapshot)) + if (ginleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(ginleader->snapshot)) UnregisterSnapshot(ginleader->snapshot); DestroyParallelContext(ginleader->pcxt); ExitParallelMode(); @@ -1826,7 +1847,7 @@ _gin_parallel_merge(GinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * gin index build based on the snapshot its parallel scan will use. + * gin index build. 
*/ static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot) @@ -2058,6 +2079,7 @@ _gin_parallel_scan_and_build(GinBuildState *state, { SortCoordinate coordinate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -2088,13 +2110,18 @@ _gin_parallel_scan_and_build(GinBuildState *state, /* Join parallel scan */ indexInfo = BuildIndexInfo(index); indexInfo->ii_Concurrent = ginshared->isconcurrent; + pscan = ParallelTableScanFromGinBuildShared(ginshared); scan = table_beginscan_parallel(heap, - ParallelTableScanFromGinBuildShared(ginshared), + pscan, SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, progress, ginBuildCallbackParallel, state, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* write remaining accumulated entries */ ginFlushBuildState(state, index); @@ -2124,6 +2151,9 @@ _gin_parallel_scan_and_build(GinBuildState *state, ConditionVariableSignal(&ginshared->workersdonecv); tuplesort_end(state->bs_sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } /* @@ -2219,7 +2249,6 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort, heapRel, indexRel, sortmem, false); - /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index faf3e04c449..1184ed086fe 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1213,7 +1213,8 @@ heapam_index_build_range_scan(Relation heapRelation, * SnapshotAny because 
we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. + * and index whatever's live according to that while that snapshot is reset + * every so often (in the case of non-unique index). */ OldestXmin = InvalidTransactionId; @@ -1221,8 +1222,6 @@ heapam_index_build_range_scan(Relation heapRelation, * For unique indexes we need a consistent snapshot for the whole scan. * Resetting snapshots also doesn't work in xact-snapshot isolation modes, * because those keep a registered transaction snapshot for the whole xact. - * In the case of parallel scan, some additional infrastructure is required - * to perform a scan with SO_RESET_SNAPSHOT which is not yet ready. */ reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && !is_system_catalog; /* just for the case */ @@ -1284,7 +1283,7 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; - if (IsMVCCSnapshot(snapshot)) + if (!reset_snapshots && IsMVCCSnapshot(snapshot)) { /* Don't expose SnapshotAny to SQL run by predicates/expressions. */ PushActiveSnapshot(snapshot); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 8d804d6bcfe..6bb365c951d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1419,6 +1419,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,12 +1437,24 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; + /* + * For concurrent non-unique index builds, we can periodically reset + * snapshots to allow the xmin horizon to advance. This is safe since + * these builds don't require a consistent view across the entire scan. + * Unique indexes still need a stable snapshot to properly enforce + * uniqueness constraints. Isolation modes where + * IsolationUsesXactSnapshot() is true also prevent resetting because + * they keep a registered transaction snapshot for the whole transaction. + */ + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot() && !btspool->isunique; + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * live according to that, while that snapshot may be reset periodically + * for non-unique indexes in non-xact-snapshot isolation modes. 
*/ if (!isconcurrent) { @@ -1449,6 +1462,11 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); @@ -1509,7 +1527,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1536,7 +1554,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->brokenhotchain = false; table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1612,6 +1631,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* Save leader state now that it's clear build will be parallel */ buildstate->btleader = btleader; + /* + * In the case of concurrent build, snapshots are going to be reset periodically. + * Wait until all workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _bt_leader_participate_as_worker(buildstate); @@ -1620,9 +1646,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1644,7 +1674,7 @@ _bt_end_parallel(BTLeader *btleader) InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(btleader->snapshot)) + if (btleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(btleader->snapshot)) UnregisterSnapshot(btleader->snapshot); DestroyParallelContext(btleader->pcxt); ExitParallelMode(); @@ -1894,6 +1924,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, SortCoordinate coordinate; BTBuildState buildstate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -1948,12 +1979,17 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, /* Join parallel scan */ indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; + pscan = ParallelTableScanFromBTShared(btshared); scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared), + pscan, SO_NONE); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* Execute this worker's part of the sort */ if (progress) @@ -1989,4 +2025,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, tuplesort_end(btspool->sortstate); if (btspool2) tuplesort_end(btspool2->sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } diff --git 
a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 68ff0966f1c..cf337eda5f6 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -132,10 +132,10 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) { Size sz = 0; - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) sz = add_size(sz, EstimateSnapshotSpace(snapshot)); else - Assert(snapshot == SnapshotAny); + Assert(snapshot == SnapshotAny || snapshot == InvalidSnapshot); sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); @@ -144,21 +144,36 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot) + Snapshot snapshot, bool reset_snapshot) { Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); pscan->phs_snapshot_off = snapshot_off; - if (IsMVCCSnapshot(snapshot)) + /* + * Initialize parallel scan description. For normal scans with a regular + * MVCC snapshot, serialize the snapshot info. For scans that use periodic + * snapshot resets, mark the scan accordingly. 
+ */ + if (reset_snapshot) + { + Assert(snapshot == InvalidSnapshot); + pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = true; + INJECTION_POINT("table_parallelscan_initialize", NULL); + } + else if (IsMVCCSnapshot(snapshot)) { SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = false; } else { Assert(snapshot == SnapshotAny); + Assert(!reset_snapshot); pscan->phs_snapshot_any = true; + pscan->phs_reset_snapshot = false; } } @@ -172,7 +187,19 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan, Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); - if (!pscan->phs_snapshot_any) + /* + * For scans that + * use periodic snapshot resets, mark the scan accordingly and use the active + * snapshot as the initial state. + */ + if (pscan->phs_reset_snapshot) + { + Assert(ActiveSnapshotSet()); + internal_flags |= SO_RESET_SNAPSHOT; + /* Start with current active snapshot. */ + snapshot = GetActiveSnapshot(); + } + else if (!pscan->phs_snapshot_any) { /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 89e9d224eec..6f50f5aff9c 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -79,6 +79,7 @@ #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) #define PARALLEL_KEY_CLIENTCONNINFO UINT64CONST(0xFFFFFFFFFFFF000F) +#define PARALLEL_KEY_SNAPSHOT_RESTORED UINT64CONST(0xFFFFFFFFFFFF0010) /* Fixed-size parallel state. 
*/ typedef struct FixedParallelState @@ -308,6 +309,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(bool), + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + /* Estimate how much we'll need for the entrypoint info. */ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + strlen(pcxt->function_name) + 2); @@ -379,6 +384,7 @@ InitializeParallelDSM(ParallelContext *pcxt) char *entrypointstate; char *uncommittedenumsspace; char *clientconninfospace; + bool *snapshot_set_flag_space; Size lnamelen; /* Serialize shared libraries we have loaded. */ @@ -494,6 +500,19 @@ InitializeParallelDSM(ParallelContext *pcxt) strcpy(entrypointstate, pcxt->library_name); strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + + /* + * Establish dynamic shared memory to pass information about importing + * of snapshot. + */ + snapshot_set_flag_space = + shm_toc_allocate(pcxt->toc, mul_size(sizeof(bool), pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + pcxt->worker[i].snapshot_restored = &snapshot_set_flag_space[i]; + *pcxt->worker[i].snapshot_restored = false; + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, snapshot_set_flag_space); } /* Update nworkers_to_launch, in case we changed nworkers above. */ @@ -670,6 +689,10 @@ LaunchParallelWorkers(ParallelContext *pcxt) * Wait for all workers to attach to their error queues, and throw an error if * any worker fails to do this. * + * wait_for_snapshot: track whether each parallel worker has successfully restored + * its snapshot. This is needed when using periodic snapshot resets to ensure all + * workers have a valid initial snapshot before proceeding with the scan. 
+ * * Callers can assume that if this function returns successfully, then the * number of workers given by pcxt->nworkers_launched have initialized and * attached to their error queues. Whether or not these workers are guaranteed @@ -699,7 +722,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) * call this function at all. */ void -WaitForParallelWorkersToAttach(ParallelContext *pcxt) +WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot) { int i; @@ -743,9 +766,24 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); if (shm_mq_get_sender(mq) != NULL) { - /* Yes, so it is known to be attached. */ - pcxt->known_attached_workers[i] = true; - ++pcxt->nknown_attached_workers; + pg_read_barrier(); + if (!wait_for_snapshot || *pcxt->worker[i].snapshot_restored) + { + /* Worker is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + else + { + /* + * Worker attached but hasn't restored its snapshot yet. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10, WAIT_EVENT_BGWORKER_STARTUP); + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } } } else if (status == BGWH_STOPPED) @@ -788,6 +826,16 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) break; } } + + /* Set snapshot restored flag to false. 
*/ + if (pcxt->nworkers > 0) + { + bool *snapshot_restored_space; + snapshot_restored_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + for (i = 0; i < pcxt->nworkers; ++i) + snapshot_restored_space[i] = false; + } } /* @@ -1304,6 +1352,7 @@ ParallelWorkerMain(Datum main_arg) shm_toc *toc; FixedParallelState *fps; char *error_queue_space; + bool *snapshot_restored_space; shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; @@ -1507,6 +1556,11 @@ ParallelWorkerMain(Datum main_arg) fps->parallel_leader_pgproc); PushActiveSnapshot(asnapshot); + /* Snapshot is restored, set flag to make leader know about it. */ + snapshot_restored_space = shm_toc_lookup(toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + snapshot_restored_space[ParallelWorkerNumber] = true; + pg_write_barrier(); + /* * We've changed which tuples we can see, and must therefore invalidate * system caches. diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 5bcb0a861d7..122d7458561 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -404,7 +404,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index b387ed6c308..07e575164a1 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -488,7 +488,8 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, 
node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 10fe18df2e7..f22da271b9e 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -353,14 +353,6 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { - /* - * We might be able to relax this, but nothing that could otherwise work - * needs it. - */ - if (IsInParallelMode()) - elog(ERROR, - "cannot update SecondarySnapshot during a parallel operation"); - /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if required. diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 60f857675e0..9a3e14e0c0e 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -28,6 +28,7 @@ typedef struct ParallelWorkerInfo { BackgroundWorkerHandle *bgwhandle; shm_mq_handle *error_mqh; + bool *snapshot_restored; } ParallelWorkerInfo; typedef struct ParallelContext @@ -67,7 +68,7 @@ extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); extern void LaunchParallelWorkers(ParallelContext *pcxt); -extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot); extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); extern void DestroyParallelContext(ParallelContext *pcxt); extern bool ParallelContextActive(void); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 2ea06a67a63..8d54fdc169e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -87,6 +87,7 @@ typedef struct ParallelTableScanDescData 
RelFileLocator phs_locator; /* physical relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + bool phs_reset_snapshot; /* use SO_RESET_SNAPSHOT? */ Size phs_snapshot_off; /* data for snapshot */ } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 3560ba40fc2..219525dbcb0 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1212,7 +1212,8 @@ extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); */ extern void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot); + Snapshot snapshot, + bool reset_snapshot); /* * Begin a parallel scan. `pscan` needs to have been initialized with @@ -1866,7 +1867,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique index and non-parallel concurrent build, + * In case of non-unique concurrent build, * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. * That leads to changing snapshots on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 3d93232361f..60f8356ed82 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -35,13 +35,11 @@ typedef struct AttrMap AttrMap; * - the index is non-unique (unique needs consistent snapshot) * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a * registered transaction snapshot) - * - the build is not parallel (parallel needs separate infrastructure) */ #define IndexBuildResetsSnapshots(indexInfo) \ ((indexInfo)->ii_Concurrent && \ !(indexInfo)->ii_Unique && \ - !IsolationUsesXactSnapshot() && \ - !(indexInfo)->ii_ParallelWorkers) + !IsolationUsesXactSnapshot()) /* Action code for index_set_state_flags */ typedef enum diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index ed4aaaf3463..e5a8a7f3a79 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -17,6 +17,12 @@ SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice' (1 row) +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -72,30 +78,45 @@ NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + injection_points_detach +------------------------- + +(1 row) + CREATE UNIQUE INDEX CONCURRENTLY idx ON 
cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); 
+NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; BEGIN TRANSACTION; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 553787539b3..a85b0cbf575 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -3,7 +3,7 @@ CREATE EXTENSION injection_points; SELECT injection_points_set_local(); SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); - +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); @@ -53,6 +53,9 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- 2.43.0 [application/octet-stream] v5-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch (52.5K, 4-v5-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch) download | inline diff: From d3140172d2d6527a0b76679257c12d5bb646430c Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Fri, 3 Apr 2026 18:23:01 +0200 Subject: [PATCH v5 2/4] Reset snapshots periodically in non-unique non-parallel concurrent index builds Long-living snapshots used by 
CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY can hold back the global xmin horizon. Commit d9d0762 attempted to allow VACUUM to ignore such snapshots to mitigate this problem. However, this was reverted in commit e28bb88 because it could cause indexes to miss heap tuples that were HOT-updated and HOT-pruned during the index creation, leading to index corruption. This patch introduces an alternative: periodically resetting the snapshot used during the first phase. Resetting the snapshot every N pages during the heap scan allows the xmin horizon to advance. Currently, this technique is applied only to: - the first scan of the heap: the second scan, during index validation, still uses a single snapshot to ensure index correctness - non-parallel index builds: parallel index builds are not yet supported and will be addressed in following commits - non-unique indexes: unique index builds still require a consistent snapshot to enforce uniqueness constraints; this will be addressed in following commits A new scan option SO_RESET_SNAPSHOT is introduced. When set, it causes the snapshot to be reset "between" every SO_RESET_SNAPSHOT_EACH_N_PAGE pages during the scan. The heap scan code is adjusted to support this option, and the index build code is modified to use it for applicable concurrent index builds that are not on system catalogs and not using parallel workers. 
--- contrib/amcheck/verify_nbtree.c | 3 +- contrib/pgstattuple/pgstattuple.c | 2 +- src/backend/access/brin/brin.c | 19 ++- src/backend/access/gin/gininsert.c | 22 ++++ src/backend/access/gist/gistbuild.c | 5 + src/backend/access/hash/hash.c | 4 + src/backend/access/heap/heapam.c | 50 +++++++- src/backend/access/heap/heapam_handler.c | 74 +++++++++-- src/backend/access/index/genam.c | 2 +- src/backend/access/nbtree/nbtsort.c | 27 +++- src/backend/access/spgist/spginsert.c | 4 + src/backend/catalog/index.c | 30 ++++- src/backend/commands/indexcmds.c | 14 +-- src/backend/optimizer/plan/planner.c | 10 ++ src/backend/utils/errcodes.txt | 1 + src/backend/utils/misc/guc_parameters.dat | 10 ++ src/include/access/tableam.h | 31 ++++- src/include/catalog/index.h | 17 +++ src/include/miscadmin.h | 1 + src/test/modules/injection_points/Makefile | 2 +- .../expected/cic_reset_snapshots.out | 118 ++++++++++++++++++ src/test/modules/injection_points/meson.build | 1 + .../sql/cic_reset_snapshots.sql | 101 +++++++++++++++ 23 files changed, 509 insertions(+), 39 deletions(-) create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b74ab5f7a05..40874167631 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -557,7 +557,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK? */ + true, /* syncscan OK? 
*/ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 6a7f8cb4a7c..a0e7ed2e137 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -335,7 +335,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index bdb30752e09..09741186264 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1221,11 +1221,12 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) state->bs_sortstate = tuplesort_begin_index_brin(maintenance_work_mem, coordinate, TUPLESORT_NONE); - + InvalidateCatalogSnapshot(); /* scan the relation and merge per-worker results */ reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1238,6 +1239,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = table_index_build_scan(heap, index, indexInfo, false, true, brinbuildCallback, state, NULL); + InvalidateCatalogSnapshot(); /* * process the final batch * @@ -1257,6 +1259,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -2390,6 +2393,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, 
WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2415,9 +2419,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. @@ -2460,6 +2471,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -2539,6 +2552,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _brin_end_parallel(brinleader, NULL); return; } @@ -2555,6 +2570,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 9d83a495775..5b035aa5dec 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -31,6 +31,7 @@ #include "storage/condition_variable.h" #include "storage/proc.h" #include "storage/predicate.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/datum.h" #include "utils/memutils.h" @@ -683,6 +684,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + /* Report table scan phase started */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN); @@ -745,11 +749,13 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) tuplesort_begin_index_gin(heap, index, maintenance_work_mem, coordinate, TUPLESORT_NONE); + InvalidateCatalogSnapshot(); /* scan the relation in parallel and merge per-worker results */ reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -759,6 +765,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ reltuples = table_index_build_scan(heap, index, indexInfo, false, true, ginBuildCallback, &buildstate, NULL); + InvalidateCatalogSnapshot(); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); @@ -772,6 +779,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } 
MemoryContextDelete(buildstate.funcCtx); @@ -948,6 +956,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -972,9 +981,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. @@ -1017,6 +1033,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1091,6 +1109,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _gin_end_parallel(ginleader, NULL); return; } @@ -1107,6 +1127,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 7f57c787f4c..86af1e11317 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -38,11 +38,13 @@ #include "access/gist_private.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -259,6 +261,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.indtuples = 0; buildstate.indtuplesSize = 0; + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); if (buildstate.buildMode == GIST_SORTED_BUILD) { /* @@ -350,6 +353,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 8d8cd30dc38..44a374c0d57 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -24,12 +24,14 @@ #include "access/stratnum.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "commands/progress.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "optimizer/plancat.h" #include "pgstat.h" +#include "storage/proc.h" #include "storage/read_stream.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" @@ -210,6 +212,8 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; 
result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index abfd8e8970a..83344334e76 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -56,6 +56,9 @@ #include "utils/spccache.h" #include "utils/syscache.h" +/* GUCs */ +int concurrent_index_reset_snapshot_every_n_pages = 4096; + static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, uint32 options); @@ -700,6 +703,36 @@ heap_prepare_pagescan(TableScanDesc sscan) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } +/* + * Reset the active snapshot during a scan. + * This ensures the xmin horizon can advance while maintaining safe tuple visibility. + * Note: No other snapshot should be active during this operation. + */ +static inline void +heap_reset_scan_snapshot(TableScanDesc sscan) +{ + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure active snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + PopActiveSnapshot(); + + sscan->rs_snapshot = InvalidSnapshot; /* just to be tidy */ + Assert(!HaveRegisteredOrActiveSnapshot()); + InvalidateCatalogSnapshot(); + + /* The goal of the snapshot reset is to allow the xmin horizon to advance. */ + Assert(!TransactionIdIsValid(MyProc->xmin)); +#if USE_INJECTION_POINTS + /* In some cases that is still not possible because an xid has been assigned. */ + if (!TransactionIdIsValid(MyProc->xid)) + INJECTION_POINT("heap_reset_scan_snapshot_effective", NULL); +#endif + + PushActiveSnapshot(GetLatestSnapshot()); + sscan->rs_snapshot = GetActiveSnapshot(); +} + /* * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM. 
* @@ -741,7 +774,13 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); if (BufferIsValid(scan->rs_cbuf)) + { scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT && + concurrent_index_reset_snapshot_every_n_pages > 0 && + scan->rs_cblock % concurrent_index_reset_snapshot_every_n_pages == 0) + heap_reset_scan_snapshot((TableScanDesc) scan); + } } /* @@ -1421,7 +1460,16 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_parallelworkerdata != NULL) pfree(scan->rs_parallelworkerdata); - +#ifdef USE_ASSERT_CHECKING + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) + { + Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)); + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + } +#endif if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 20d3b46e062..faf3e04c449 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1163,8 +1163,17 @@ heapam_index_build_range_scan(Relation heapRelation, TupleTableSlot *slot; EState *estate; ExprContext *econtext; - Snapshot snapshot; + /* + * In isolation modes where IsolationUsesXactSnapshot() is true, the + * registered scan snapshot can differ from the active snapshot copy + * pushed for expression evaluation, so remember the registered one + * separately for later UnregisterSnapshot(). 
+ */ + Snapshot snapshot, + registered_snapshot = InvalidSnapshot; bool need_unregister_snapshot = false; + bool need_pop_active_snapshot = false; + bool reset_snapshots = false; TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; @@ -1199,9 +1208,6 @@ heapam_index_build_range_scan(Relation heapRelation, /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time @@ -1211,6 +1217,16 @@ heapam_index_build_range_scan(Relation heapRelation, */ OldestXmin = InvalidTransactionId; + /* + * For unique indexes we need a consistent snapshot for the whole scan. + * Resetting snapshots also doesn't work in xact-snapshot isolation modes, + * because those keep a registered transaction snapshot for the whole xact. + * In the case of parallel scan, some additional infrastructure is required + * to perform a scan with SO_RESET_SNAPSHOT which is not yet ready. + */ + reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && + !is_system_catalog; /* just for the case */ + /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); @@ -1219,24 +1235,42 @@ heapam_index_build_range_scan(Relation heapRelation, { /* * Serial index build. - * - * Must begin our own heap scan in this case. We may also need to - * register a snapshot whose lifetime is under our direct control. */ if (!TransactionIdIsValid(OldestXmin)) { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - need_unregister_snapshot = true; + snapshot = GetTransactionSnapshot(); + /* + * Must begin our own heap scan in this case. 
We may also need to + * register a snapshot whose lifetime is under our direct control. + * In case of resetting of a snapshot during the scan, registration is + * not allowed because snapshot is going to be changed every so + * often. + */ + if (!reset_snapshots) + { + /* Store active snapshot because PushActiveSnapshot() may copy */ + snapshot = registered_snapshot = RegisterSnapshot(snapshot); + need_unregister_snapshot = true; + } + Assert(!ActiveSnapshotSet()); + PushActiveSnapshot(snapshot); + /* table_beginscan_strat() needs the exact active snapshot pointer */ + snapshot = GetActiveSnapshot(); + need_pop_active_snapshot = true; } else + { + Assert(!indexInfo->ii_Concurrent); snapshot = SnapshotAny; + } scan = table_beginscan_strat(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? */ + allow_sync, /* syncscan OK? */ + reset_snapshots /* reset snapshots? */); } else { @@ -1250,6 +1284,12 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; + if (IsMVCCSnapshot(snapshot)) + { + /* Don't expose SnapshotAny to SQL run by predicates/expressions. */ + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; + } } hscan = (HeapScanDesc) scan; @@ -1264,6 +1304,13 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. 
*/ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1699,9 +1746,11 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) - UnregisterSnapshot(snapshot); + UnregisterSnapshot(registered_snapshot); ExecDropSingleTupleTableSlot(slot); @@ -1771,7 +1820,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 97d44b84622..204b5b614ba 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -481,7 +481,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 756dfa3dcf4..8d804d6bcfe 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -262,7 +262,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -325,18 
+325,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. */ - _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2, IndexBuildResetsSnapshots(indexInfo)); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -484,6 +486,8 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, else reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -539,7 +543,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) { BTWriteState wstate; @@ -561,18 +565,21 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + InvalidateCatalogSnapshot(); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1411,6 +1418,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,9 +1444,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. 
*/ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1492,6 +1507,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1586,6 +1603,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1602,6 +1621,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 780ef646a54..ff457e3bbfa 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -20,10 +20,12 @@ #include "access/spgist_private.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -143,6 +145,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) result = palloc0_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 9407c357f27..ce017e836be 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -81,6 +81,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; @@ -1510,8 +1511,8 @@ index_concurrently_build(Oid heapRelationId, Relation indexRelation; IndexInfo *indexInfo; - /* This had better make sure that a snapshot is active */ - Assert(ActiveSnapshotSet()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xid)); /* Open and lock the parent heap relation */ heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock); @@ -1529,19 +1530,28 @@ index_concurrently_build(Oid heapRelationId, indexRelation = index_open(indexRelationId, RowExclusiveLock); + /* BuildIndexInfo may require a snapshot for 
expressions and predicates */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * We have to re-build the IndexInfo struct, since it was lost in the * commit of the transaction where this concurrent index was created at * the catalog level. */ indexInfo = BuildIndexInfo(indexRelation); + /* Done with snapshot */ + PopActiveSnapshot(); Assert(!indexInfo->ii_ReadyForInserts); indexInfo->ii_Concurrent = true; indexInfo->ii_BrokenHotChain = false; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* Now build the index */ index_build(heapRel, indexRelation, indexInfo, false, true, true); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -1552,12 +1562,19 @@ index_concurrently_build(Oid heapRelationId, table_close(heapRel, NoLock); index_close(indexRelation, NoLock); + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates. */ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); + /* we can do away with our snapshot */ + PopActiveSnapshot(); } /* @@ -3257,7 +3274,8 @@ IndexCheckExclusion(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK */ + true, /* syncscan OK */ + false); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3320,12 +3338,16 @@ IndexCheckExclusion(Relation heapRelation, * as of the start of the scan (see table_index_build_scan), whereas a normal * build takes care to include recently-dead tuples. 
This is OK because * we won't mark the index valid until all transactions that might be able - * to see those tuples are gone. The reason for doing that is to avoid + * to see those tuples are gone. One of the reasons for doing that is to avoid * bogus unique-index failures due to concurrent UPDATEs (we might see * different versions of the same row as being valid when we pass over them, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * + * Furthermore, in the case of a non-unique index we set SO_RESET_SNAPSHOT for the + * scan, which causes a new snapshot to be set as active every so often. The reason + * for that is to allow the xmin horizon to advance. + * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all * transactions that could have been modifying the table to terminate. Now diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 9ab74c8df0a..be3dc5e8d28 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1704,23 +1704,17 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We now take a new snapshot, and build the index using all tuples that - * are visible in this snapshot. We can be sure that any HOT updates to + * We build the index using all tuples that are visible in a single snapshot + * or in a series of periodically refreshed snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its * HOT-chain or the extension of the chain is HOT-safe for this index. 
*/ - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* Perform concurrent build of index */ index_concurrently_build(tableId, indexRelationId); - /* we can do away with our snapshot */ - PopActiveSnapshot(); - /* * Commit this transaction to make the indisready update visible. */ @@ -4137,9 +4131,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* * Update progress for the index to build, with the correct parent * table involved. @@ -4154,7 +4145,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Perform concurrent build of new index */ index_concurrently_build(newidx->tableId, newidx->indexId); - PopActiveSnapshot(); CommitTransactionCommand(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4ec76ce31a9..6a086e23a6c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,6 +63,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/snapmgr.h" /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -7047,6 +7048,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) Relation heap; Relation index; RelOptInfo *rel; + bool need_pop_active_snapshot = false; int parallel_workers; BlockNumber heap_blocks; double reltuples; @@ -7102,6 +7104,12 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) heap = table_open(tableOid, NoLock); index = index_open(indexOid, NoLock); + /* Set ActiveSnapshot since functions in the indexes may need it */ + if (!ActiveSnapshotSet()) + { + PushActiveSnapshot(GetTransactionSnapshot()); + need_pop_active_snapshot = true; + } /* * Determine if it's safe to proceed. 
* @@ -7159,6 +7167,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) parallel_workers--; done: + if (need_pop_active_snapshot) + PopActiveSnapshot(); index_close(index, NoLock); table_close(heap, NoLock); diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 5b25402ebbe..a80f08af241 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -259,6 +259,7 @@ Section: Class 25 - Invalid Transaction State 25P02 E ERRCODE_IN_FAILED_SQL_TRANSACTION in_failed_sql_transaction 25P03 E ERRCODE_IDLE_IN_TRANSACTION_SESSION_TIMEOUT idle_in_transaction_session_timeout 25P04 E ERRCODE_TRANSACTION_TIMEOUT transaction_timeout +25P05 E ERRCODE_INAPPROPRIATE_ISOLATION_LEVEL_FOR_COMMAND inappropriate_isolation_level_for_command Section: Class 26 - Invalid SQL Statement Name diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 83af594d4af..df92b9fee68 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -517,6 +517,16 @@ options => 'compute_query_id_options', }, +{ name => 'concurrent_index_reset_snapshot_every_n_pages', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets how often concurrent index builds refresh their snapshot.', + long_desc => 'Zero disables periodic snapshot refresh during the first heap scan of eligible CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY builds.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'concurrent_index_reset_snapshot_every_n_pages', + boot_val => '4096', + min => '0', + max => 'INT_MAX', +}, + { name => 'config_file', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', short_desc => 'Sets the server\'s main configuration file.', flags => 'GUC_DISALLOW_IN_FILE | GUC_SUPERUSER_ONLY', diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index c13f05d39db..3560ba40fc2 100644 --- 
a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -24,6 +24,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/injection_point.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -72,6 +73,18 @@ typedef enum ScanOptions /* collect scan instrumentation */ SO_SCAN_INSTRUMENT = 1 << 11, + /* + * Reset scan and catalog snapshot every so often? If so, the + * concurrent_index_reset_snapshot_every_n_pages GUC decides when the + * active snapshot is popped, the catalog snapshot invalidated, and the + * latest snapshot pushed as active. + * + * At the end of the scan the snapshot is not popped. + * The goal of this mode is to keep the xmin horizon propagating forward. + * + * See heap_reset_scan_snapshot for details. + */ + SO_RESET_SNAPSHOT = 1 << 12, } ScanOptions; /* @@ -85,7 +98,7 @@ typedef enum ScanOptions (SO_TYPE_SEQSCAN | SO_TYPE_BITMAPSCAN | SO_TYPE_SAMPLESCAN | \ SO_TYPE_TIDSCAN | SO_TYPE_TIDRANGESCAN | SO_TYPE_ANALYZE | \ SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | \ - SO_TEMP_SNAPSHOT) + SO_TEMP_SNAPSHOT | SO_RESET_SNAPSHOT) /* * Result codes for table_{update,delete,lock_tuple}, and for visibility @@ -967,7 +980,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, - bool allow_strat, bool allow_sync) + bool allow_strat, bool allow_sync, + bool reset_snapshot) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; @@ -975,6 +989,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, flags |= SO_ALLOW_STRAT; if (allow_sync) flags |= SO_ALLOW_SYNC; + if (reset_snapshot) + { + INJECTION_POINT("table_beginscan_strat_reset_snapshots", NULL); + /* Active snapshot is required on start. */ + Assert(GetActiveSnapshot() == snapshot); + /* Active snapshot should not be registered to keep xmin propagating.
*/ + Assert(GetActiveSnapshot()->regd_count == 0); + flags |= (SO_RESET_SNAPSHOT); + } return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags, SO_NONE); @@ -1842,6 +1865,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * very hard to detect whether they're really incompatible with the chain tip. * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. + * + * In the case of a non-unique index and a non-parallel concurrent build, + * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. + * That leads to changing snapshots on the fly to allow the xmin horizon to propagate. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 9aee8226347..3d93232361f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -14,6 +14,7 @@ #ifndef INDEX_H #define INDEX_H +#include "access/xact.h" #include "catalog/objectaddress.h" #include "nodes/execnodes.h" @@ -26,6 +27,22 @@ typedef struct AttrMap AttrMap; #define DEFAULT_INDEX_TYPE "btree" +/* + * Does this concurrent index build use periodic snapshot resets?
+ * + * Snapshot resetting is only applicable when all of: + * - the build is concurrent (ii_Concurrent) + * - the index is non-unique (unique needs consistent snapshot) + * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a + * registered transaction snapshot) + * - the build is not parallel (parallel needs separate infrastructure) + */ +#define IndexBuildResetsSnapshots(indexInfo) \ + ((indexInfo)->ii_Concurrent && \ + !(indexInfo)->ii_Unique && \ + !IsolationUsesXactSnapshot() && \ + !(indexInfo)->ii_ParallelWorkers) + /* Action code for index_set_state_flags */ typedef enum { diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 8ccdf61246b..09cbb4b1d99 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -272,6 +272,7 @@ extern PGDLLIMPORT int work_mem; extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; +extern PGDLLIMPORT int concurrent_index_reset_snapshot_every_n_pages; /* * Upper and lower hard limits for the buffer access strategy ring size diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index f057d143d1a..0ba0e47f3b8 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -9,7 +9,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc vacuum +REGRESS = injection_points hashagg reindex_conc vacuum cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic \ diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 00000000000..ed4aaaf3463 --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 
+1,118 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; 
+NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE 
INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY cannot run inside a transaction block +ROLLBACK ; +SET default_transaction_isolation = 'repeatable read'; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function 
cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index fb1418e2caa..fe58023f904 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -36,6 +36,7 @@ tests += { 'hashagg', 'reindex_conc', 'vacuum', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 00000000000..553787539b3 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,101 @@ +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON 
cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY 
cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ROLLBACK ; + +SET default_transaction_isolation = 'repeatable read'; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; -- 2.43.0 [application/octet-stream] v5-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch (41.9K, 5-v5-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch) download | inline diff: From eb921ae95108d78a79341fc78c8c06186f62dba4 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 11 Apr 2026 18:45:47 +0200 Subject: [PATCH v5 4/4] Support snapshot resets in concurrent builds of unique indexes Previously, concurrent builds of unique index used a fixed snapshot for the entire scan to ensure proper uniqueness checks. Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: - ignoring SnapshotSelf dead tuples during uniqueness checks in tuplesort as not a guarantee, but a fail-fast mechanics - adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values as a guarantee of correctness Tuples are SnapshotSelf tested only in the case of equal index key values, otherwise _bt_load works like before. 
--- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 7 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtreadpage.c | 2 +- src/backend/access/nbtree/nbtsort.c | 158 ++++++++++++++---- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 59 ++++++- src/backend/catalog/index.c | 6 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/misc/guc_parameters.dat | 10 ++ src/backend/utils/sort/tuplesort.c | 2 + src/backend/utils/sort/tuplesortvariants.c | 101 +++++++++-- src/include/access/nbtree.h | 6 +- src/include/access/tableam.h | 7 +- src/include/catalog/index.h | 2 - src/include/miscadmin.h | 1 + src/include/utils/tuplesort.h | 3 + .../expected/cic_reset_snapshots.out | 6 + 18 files changed, 320 insertions(+), 86 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375a..829dad1194e 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. Any tuples visible in the -snapshot will have only valid forward-growing HOT chains. (They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. +for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) 
+As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 1184ed086fe..68f6e6731df 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1214,13 +1214,14 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in the case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique indexes we need a consistent snapshot for the whole scan. - * Resetting snapshots also doesn't work in xact-snapshot isolation modes, + * For concurrent builds of non-system indexes, we want to periodically + * reset snapshots. + * Resetting snapshots doesn't work in xact-snapshot isolation modes, * because those keep a registered transaction snapshot for the whole xact. 
*/ reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index af7affdf409..10f4f7eeba9 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -149,7 +149,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -376,7 +376,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index 2ba1ca66023..c4c278e777a 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -623,7 +623,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's 
page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6bb365c951d..3982e5e8a1d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -87,6 +87,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -105,6 +106,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -207,15 +209,14 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and built + * non-concurrently. Dead tuples are put into spool2 instead of spool + * in order to avoid uniqueness check. 
*/ BTSpool *spool2; double indtuples; @@ -262,7 +263,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -307,8 +308,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -385,6 +384,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks only when concurrent + * unique builds actually use periodic snapshot resets. + */ + btspool->unique_dead_ignored = IndexBuildResetsSnapshots(indexInfo) && + indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -433,8 +438,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor. 
*/ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -442,8 +448,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. + * + * In case of concurrent build dead tuples do not need to be put into index + * since we wait for all snapshots older than reference snapshot during the + * validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !IndexBuildResetsSnapshots(indexInfo)) { BTSpool *btspool2 = palloc0_object(BTSpool); SortCoordinate coordinate2 = NULL; @@ -474,7 +484,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -1155,14 +1165,100 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; - bool deduplicate; + bool deduplicate, + fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * The unique_dead_ignored does not guarantee absence of multiple alive + * tuples with the same values in the spool. Such a case may happen if + * alive tuples are located between a few dead tuples, like this: addda. 
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap, SO_NONE); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* if this is not the first tuple */ + { + bool has_nulls = false, + call_again = false, + ignored = false, + now_alive; + ItemPointerData tid; + + /* is this tuple equal to previous one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* check previous tuple if not yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group? 
*/ + if (seen_alive && now_alive) + _bt_report_duplicate(wstate->index, wstate->heap, itup); + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + if (prev) pfree(prev); + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1322,7 +1418,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1438,15 +1534,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; /* - * For concurrent non-unique index builds, we can periodically reset - * snapshots to allow the xmin horizon to advance. This is safe since - * these builds don't require a consistent view across the entire scan. - * Unique indexes still need a stable snapshot to properly enforce - * uniqueness constraints. Isolation modes where + * For concurrent index builds, we can periodically reset snapshots to + * allow the xmin horizon to advance. + * Isolation modes where * IsolationUsesXactSnapshot() is true also prevent resetting because * they keep a registered transaction snapshot for the whole transaction. */ - reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot() && !btspool->isunique; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); /* * Prepare for scan of the base relation. 
In a normal index build, we use @@ -1454,7 +1548,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's * live according to that, while that snapshot may be reset periodically - * for non-unique indexes in non-xact-snapshot isolation modes. + * in non-xact-snapshot isolation modes. */ if (!isconcurrent) { @@ -1483,10 +1577,10 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_estimate_chunk(&pcxt->estimator, estsort); /* - * Unique case requires a second spool, and so we may have to account for - * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 + * Non-concurrent unique case requires a second spool, and so we may have + * to account for another shared workspace -- PARALLEL_KEY_TUPLESORT_SPOOL2 */ - if (!btspool->isunique) + if (!btspool->isunique || reset_snapshot) shm_toc_estimate_keys(&pcxt->estimator, 2); else { @@ -1541,6 +1635,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1568,8 +1663,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); - /* Unique case requires a second spool, and associated shared state */ - if (!btspool->isunique) + /* Non-concurrent unique case requires a second spool and shared state */ + if (!btspool->isunique || reset_snapshot) sharedsort2 = NULL; else { @@ -1752,9 
+1847,10 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ - if (!btleader->btshared->isunique) + if (!btleader->btshared->isunique || (btleader->btshared->isconcurrent && !IsolationUsesXactSnapshot())) leaderworker2 = NULL; else { @@ -1855,11 +1951,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || (btshared->isconcurrent && !IsolationUsesXactSnapshot())) { btspool2 = NULL; sharedsort2 = NULL; @@ -1939,6 +2036,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1961,14 +2059,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git 
a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index de9eca3c8b2..b174fb8d3ee 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -688,7 +688,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -719,7 +719,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -968,7 +968,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -989,7 +989,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1028,7 +1028,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1150,7 +1150,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 014faa1622f..e9351e54d7f 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -17,6 +17,7 @@ #include <time.h> +#include "access/genam.h" #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" @@ -32,9 +33,6 @@ static int _bt_compare_int(const void *va, const void *vb); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); - /* * _bt_mkscankey @@ -707,7 +705,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -825,17 +823,24 @@ _bt_truncate(Relation rel, IndexTuple 
lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. * + * This is exported to be used as comparison function during concurrent + * unique index build in case _bt_keep_natts_fast is not suitable because + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * hasnulls value set to true in case of any null column in any tuple. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -861,6 +866,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; if (isNull1 != isNull2) break; @@ -880,7 +887,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -891,7 +898,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. 
+ * save cycles. Also, it may be used as comparison function during concurrent + * build of unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -900,6 +908,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * hasnulls value set to true in case of any null column in any tuple. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -908,7 +918,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -925,6 +936,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) @@ -1102,6 +1115,34 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) return tupnatts > 0 && tupnatts <= nkeyatts; } +/* + * _bt_report_duplicate() -- report a unique violation during index build. + * + * This is used by both _bt_load() and the tuplesort comparator's fail-fast + * check to report duplicate keys found during CREATE INDEX CONCURRENTLY or + * REINDEX CONCURRENTLY with snapshot resets enabled. 
+ */ +void +_bt_report_duplicate(Relation indexRel, Relation heapRel, IndexTuple itup) +{ + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(indexRel), values, isnull); + + key_desc = BuildIndexValueDescription(indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(heapRel, + RelationGetRelationName(indexRel)))); +} + /* * * _bt_check_third_page() -- check whether tuple fits on a btree page at all. diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index ce017e836be..fa073a33b65 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3344,9 +3344,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. 
Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index be3dc5e8d28..9fff8635d3d 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1704,8 +1704,8 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index df92b9fee68..fb960384c18 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -1356,6 +1356,16 @@ show_hook => 'show_in_hot_standby', }, +{ name => 'index_build_duplicate_check_max_fetch_pct', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Limits heap fetches during fail-fast duplicate checking in concurrent unique index builds.', + long_desc => 'Maximum percentage of total tuplesort tuples that may trigger heap fetches in the sort comparator. Zero disables the fail-fast check. 
100 allows unlimited fetches.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'index_build_duplicate_check_max_fetch_pct', + boot_val => '1', + min => '0', + max => '100', +}, + { name => 'integer_datetimes', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows whether datetimes are integer based.', flags => 'GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 72c2c2995d8..b9caa001fdc 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -688,6 +688,7 @@ tuplesort_begin_batch(Tuplesortstate *state) state->tapeset = NULL; state->memtupcount = 0; + state->base.ntuples = 0; state->growmemtuples = true; state->slabAllocatorUsed = false; @@ -1073,6 +1074,7 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, /* account for the memory used for this tuple */ USEMEM(state, tuplen); state->tupleMem += tuplen; + state->base.ntuples++; if (!useAbbrev) { diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 2509ac3e3a4..fb5d9126509 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -25,9 +25,12 @@ #include "access/hash.h" #include "access/htup_details.h" #include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" #include "catalog/index.h" #include "catalog/pg_collation.h" #include "executor/executor.h" +#include "miscadmin.h" #include "pg_trace.h" #include "utils/builtins.h" #include "utils/datum.h" @@ -35,7 +38,9 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/tuplesort.h" +#include "storage/proc.h" +int index_build_duplicate_check_max_fetch_pct = 1; /* sort-type codes for sort__start probes */ #define HEAP_SORT 0 @@ -136,6 +141,9 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool 
uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ + int heapFetchCount; /* heap fetches performed in comparator */ + int heapFetchLimit; /* max allowed, -1 = not yet computed */ } TuplesortIndexBTreeArg; /* @@ -361,6 +369,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -403,6 +412,9 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; + arg->heapFetchCount = 0; + arg->heapFetchLimit = -1; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1667,9 +1679,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ if (arg->enforceUnique && !(!arg->uniqueNullsNotDistinct && equal_hasnull)) { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1679,18 +1689,79 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* + * Fail-fast check: perform heap fetches to see if either tuple is + * dead, allowing us to skip the uniqueness error. See _bt_load for + * the definitive check. 
+ * + * The number of heap fetches is bounded by + * index_build_duplicate_check_max_fetch_pct to avoid excessive I/O + * when many equal keys are compared during sorting. Once the budget + * is exhausted, we skip the fail-fast check and let _bt_load() + * handle uniqueness verification. + */ + if (arg->uniqueDeadIgnored) + { + /* Lazily compute the heap fetch limit */ + if (arg->heapFetchLimit < 0) + { + if (index_build_duplicate_check_max_fetch_pct > 0) + arg->heapFetchLimit = (int) (base->ntuples * + index_build_duplicate_check_max_fetch_pct / 100); + else + arg->heapFetchLimit = 0; + } + + if (arg->heapFetchLimit > 0 && + arg->heapFetchCount < arg->heapFetchLimit) + { + bool any_tuple_dead, + call_again = false, + ignored; + /* + * Keep the slot/fetch lifetime local to this duplicate check + * so the xmin propagation assertion below continues to + * validate cleanup. + */ + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel, SO_NONE); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + arg->heapFetchCount++; + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, + &ignored); + arg->heapFetchCount++; + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else + { + /* Budget exhausted; defer to _bt_load() */ + uniqueCheckFail = 
false; + } + } + if (uniqueCheckFail) + _bt_report_duplicate(arg->index.indexRel, arg->index.heapRel, tuple1); } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3097e9bb1af..bd789b7d344 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1307,13 +1307,17 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); extern bool _bt_allequalimage(Relation rel, bool debugmessage); +extern void _bt_report_duplicate(Relation indexRel, Relation heapRel, + IndexTuple itup); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 219525dbcb0..d1a298bf1bb 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1867,9 +1867,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent build, - * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. - * That leads to changing snapshots on the fly to allow xmin horizon propagate. + * In case of concurrent index build, + * concurrent_index_reset_snapshot_every_n_pages is applied for the + * scan. That leads to changing snapshots on the fly to allow xmin + * horizon propagate. 
*/ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 60f8356ed82..ed000bf10d5 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -32,13 +32,11 @@ typedef struct AttrMap AttrMap; * * Snapshot resetting is only applicable when all of: * - the build is concurrent (ii_Concurrent) - * - the index is non-unique (unique needs consistent snapshot) * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a * registered transaction snapshot) */ #define IndexBuildResetsSnapshots(indexInfo) \ ((indexInfo)->ii_Concurrent && \ - !(indexInfo)->ii_Unique && \ !IsolationUsesXactSnapshot()) /* Action code for index_set_state_flags */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 09cbb4b1d99..8954b7818f0 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -273,6 +273,7 @@ extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; extern PGDLLIMPORT int concurrent_index_reset_snapshot_every_n_pages; +extern PGDLLIMPORT int index_build_duplicate_check_max_fetch_pct; /* * Upper and lower hard limits for the buffer access strategy ring size diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index da68f45acf2..191010c57c3 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -216,6 +216,8 @@ typedef struct bool tuples; /* Can SortTuple.tuple ever be set? 
*/ + int64 ntuples; /* total tuples inserted into sort */ + void *arg; /* Specific information for the sort variant */ } TuplesortPublic; @@ -396,6 +398,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index e5a8a7f3a79..7835a796b88 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -41,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); (1 row) CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection 
point table_parallelscan_initialize -- 2.43.0 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY 2026-03-09 00:03 Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-03-20 01:15 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-03-21 23:50 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-04-06 17:55 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> 2026-04-11 17:48 ` Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]> @ 2026-04-18 13:33 ` Mihail Nikalayeu <[email protected]> 0 siblings, 0 replies; 6+ messages in thread From: Mihail Nikalayeu @ 2026-04-18 13:33 UTC (permalink / raw) To: Álvaro Herrera <[email protected]>; +Cc: PostgreSQL Hackers <[email protected]>; Matthias van de Meent <[email protected]>; Antonin Houska <[email protected]>; Sergey Sargsyan <[email protected]>; Hannu Krosing <[email protected]> Some small fixes around. Attachments: [application/octet-stream] v6-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch (51.5K, 2-v6-0002-Reset-snapshots-periodically-in-non-unique-non-pa.patch) download | inline diff: From c6f4c96ea7cfa8217a021a84952b80d2ddbdbcfc Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Fri, 3 Apr 2026 18:23:01 +0200 Subject: [PATCH v6 2/4] Reset snapshots periodically in non-unique non-parallel concurrent index builds Long-living snapshots used by CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY can hold back the global xmin horizon. Commit d9d0762 attempted to allow VACUUM to ignore such snapshots to mitigate this problem. 
However, this was reverted in commit e28bb88 because it could cause indexes to miss heap tuples that were HOT-updated and HOT-pruned during the index creation, leading to index corruption. This patch introduces an alternative: periodically resetting the snapshot used during the first phase. Resetting the snapshot every N pages during the heap scan allows the xmin horizon to advance. Currently, this technique is applied only to: - the first scan of the heap: the second scan during index validation still uses a single snapshot to ensure index correctness - non-parallel index builds: parallel index builds are not yet supported and will be addressed in following commits - non-unique indexes: unique index builds still require a consistent snapshot to enforce uniqueness constraints; this will be addressed in following commits A new scan option SO_RESET_SNAPSHOT is introduced. When set, it causes the snapshot to be reset "between" every SO_RESET_SNAPSHOT_EACH_N_PAGE pages during the scan. The heap scan code is adjusted to support this option, and the index build code is modified to use it for applicable concurrent index builds that are not on system catalogs and not using parallel workers.
--- contrib/amcheck/verify_nbtree.c | 3 +- contrib/pgstattuple/pgstattuple.c | 2 +- src/backend/access/brin/brin.c | 21 +++- src/backend/access/gin/gininsert.c | 21 ++++ src/backend/access/gist/gistbuild.c | 5 + src/backend/access/hash/hash.c | 4 + src/backend/access/heap/heapam.c | 50 +++++++- src/backend/access/heap/heapam_handler.c | 74 +++++++++-- src/backend/access/index/genam.c | 2 +- src/backend/access/nbtree/nbtsort.c | 27 +++- src/backend/access/spgist/spginsert.c | 4 + src/backend/catalog/index.c | 30 ++++- src/backend/commands/indexcmds.c | 14 +-- src/backend/optimizer/plan/planner.c | 10 ++ src/backend/utils/misc/guc_parameters.dat | 10 ++ src/include/access/tableam.h | 31 ++++- src/include/catalog/index.h | 17 +++ src/include/miscadmin.h | 1 + src/test/modules/injection_points/Makefile | 2 +- .../expected/cic_reset_snapshots.out | 118 ++++++++++++++++++ src/test/modules/injection_points/meson.build | 1 + .../sql/cic_reset_snapshots.sql | 101 +++++++++++++++ 22 files changed, 509 insertions(+), 39 deletions(-) create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b74ab5f7a05..40874167631 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -557,7 +557,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK? */ + true, /* syncscan OK? 
*/ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 6a7f8cb4a7c..a0e7ed2e137 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -335,7 +335,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index bdb30752e09..7e058f0d7ad 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1221,11 +1221,13 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) state->bs_sortstate = tuplesort_begin_index_brin(maintenance_work_mem, coordinate, TUPLESORT_NONE); - + InvalidateCatalogSnapshot(); /* scan the relation and merge per-worker results */ reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1238,6 +1240,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = table_index_build_scan(heap, index, indexInfo, false, true, brinbuildCallback, state, NULL); + InvalidateCatalogSnapshot(); /* * process the final batch * @@ -1257,6 +1260,8 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -2390,6 +2395,7 @@ 
_brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2415,9 +2421,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. @@ -2460,6 +2473,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -2539,6 +2554,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _brin_end_parallel(brinleader, NULL); return; } @@ -2555,6 +2572,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 9d83a495775..4df216a268b 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -683,6 +683,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + /* Report table scan phase started */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN); @@ -745,11 +748,13 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) tuplesort_begin_index_gin(heap, index, maintenance_work_mem, coordinate, TUPLESORT_NONE); + InvalidateCatalogSnapshot(); /* scan the relation in parallel and merge per-worker results */ reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -759,6 +764,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ reltuples = table_index_build_scan(heap, index, indexInfo, false, true, ginBuildCallback, &buildstate, NULL); + InvalidateCatalogSnapshot(); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); @@ -772,6 +778,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); } MemoryContextDelete(buildstate.funcCtx); @@ -948,6 +955,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool 
need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -972,9 +980,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. @@ -1017,6 +1032,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1091,6 +1108,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _gin_end_parallel(ginleader, NULL); return; } @@ -1107,6 +1126,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 7f57c787f4c..86af1e11317 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -38,11 +38,13 @@ #include "access/gist_private.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -259,6 +261,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.indtuples = 0; buildstate.indtuplesSize = 0; + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); if (buildstate.buildMode == GIST_SORTED_BUILD) { /* @@ -350,6 +353,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = (double) buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 8d8cd30dc38..44a374c0d57 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -24,12 +24,14 @@ #include "access/stratnum.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "commands/progress.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "optimizer/plancat.h" #include "pgstat.h" +#include "storage/proc.h" #include "storage/read_stream.h" #include "utils/fmgrprotos.h" #include "utils/index_selfuncs.h" @@ -210,6 +212,8 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; 
result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index abfd8e8970a..83344334e76 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -56,6 +56,9 @@ #include "utils/spccache.h" #include "utils/syscache.h" +/* GUCs */ +int concurrent_index_reset_snapshot_every_n_pages = 4096; + static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, uint32 options); @@ -700,6 +703,36 @@ heap_prepare_pagescan(TableScanDesc sscan) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } +/* + * Reset the active snapshot during a scan. + * This ensures the xmin horizon can advance while maintaining safe tuple visibility. + * Note: No other snapshot should be active during this operation. + */ +static inline void +heap_reset_scan_snapshot(TableScanDesc sscan) +{ + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure active snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + PopActiveSnapshot(); + + sscan->rs_snapshot = InvalidSnapshot; /* just to be tidy */ + Assert(!HaveRegisteredOrActiveSnapshot()); + InvalidateCatalogSnapshot(); + + /* The goal of snapshot reset is to allow horizon to advance. */ + Assert(!TransactionIdIsValid(MyProc->xmin)); +#if USE_INJECTION_POINTS + /* In some cases it is still not possible due xid assign. */ + if (!TransactionIdIsValid(MyProc->xid)) + INJECTION_POINT("heap_reset_scan_snapshot_effective", NULL); +#endif + + PushActiveSnapshot(GetLatestSnapshot()); + sscan->rs_snapshot = GetActiveSnapshot(); +} + /* * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM. 
* @@ -741,7 +774,13 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); if (BufferIsValid(scan->rs_cbuf)) + { scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT && + concurrent_index_reset_snapshot_every_n_pages > 0 && + scan->rs_cblock % concurrent_index_reset_snapshot_every_n_pages == 0) + heap_reset_scan_snapshot((TableScanDesc) scan); + } } /* @@ -1421,7 +1460,16 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_parallelworkerdata != NULL) pfree(scan->rs_parallelworkerdata); - +#ifdef USE_ASSERT_CHECKING + if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) + { + Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)); + /* Make sure no other snapshot was set as active. */ + Assert(GetActiveSnapshot() == sscan->rs_snapshot); + /* And make sure snapshot is not registered. */ + Assert(GetActiveSnapshot()->regd_count == 0); + } +#endif if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 20d3b46e062..faf3e04c449 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1163,8 +1163,17 @@ heapam_index_build_range_scan(Relation heapRelation, TupleTableSlot *slot; EState *estate; ExprContext *econtext; - Snapshot snapshot; + /* + * In isolation modes where IsolationUsesXactSnapshot() is true, the + * registered scan snapshot can differ from the active snapshot copy + * pushed for expression evaluation, so remember the registered one + * separately for later UnregisterSnapshot(). 
+ */ + Snapshot snapshot, + registered_snapshot = InvalidSnapshot; bool need_unregister_snapshot = false; + bool need_pop_active_snapshot = false; + bool reset_snapshots = false; TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; @@ -1199,9 +1208,6 @@ heapam_index_build_range_scan(Relation heapRelation, /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time @@ -1211,6 +1217,16 @@ heapam_index_build_range_scan(Relation heapRelation, */ OldestXmin = InvalidTransactionId; + /* + * For unique indexes we need a consistent snapshot for the whole scan. + * Resetting snapshots also doesn't work in xact-snapshot isolation modes, + * because those keep a registered transaction snapshot for the whole xact. + * In the case of parallel scan, some additional infrastructure is required + * to perform a scan with SO_RESET_SNAPSHOT which is not yet ready. + */ + reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && + !is_system_catalog; /* just for the case */ + /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); @@ -1219,24 +1235,42 @@ heapam_index_build_range_scan(Relation heapRelation, { /* * Serial index build. - * - * Must begin our own heap scan in this case. We may also need to - * register a snapshot whose lifetime is under our direct control. */ if (!TransactionIdIsValid(OldestXmin)) { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - need_unregister_snapshot = true; + snapshot = GetTransactionSnapshot(); + /* + * Must begin our own heap scan in this case. 
We may also need to + * register a snapshot whose lifetime is under our direct control. + * In case of resetting of a snapshot during the scan, registration is + * not allowed because snapshot is going to be changed every so + * often. + */ + if (!reset_snapshots) + { + /* Store active snapshot because PushActiveSnapshot() may copy */ + snapshot = registered_snapshot = RegisterSnapshot(snapshot); + need_unregister_snapshot = true; + } + Assert(!ActiveSnapshotSet()); + PushActiveSnapshot(snapshot); + /* table_beginscan_strat() needs the exact active snapshot pointer */ + snapshot = GetActiveSnapshot(); + need_pop_active_snapshot = true; } else + { + Assert(!indexInfo->ii_Concurrent); snapshot = SnapshotAny; + } scan = table_beginscan_strat(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? */ + allow_sync, /* syncscan OK? */ + reset_snapshots /* reset snapshots? */); } else { @@ -1250,6 +1284,12 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; + if (IsMVCCSnapshot(snapshot)) + { + /* Don't expose SnapshotAny to SQL run by predicates/expressions. */ + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; + } } hscan = (HeapScanDesc) scan; @@ -1264,6 +1304,13 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. 
*/ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1699,9 +1746,11 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) - UnregisterSnapshot(snapshot); + UnregisterSnapshot(registered_snapshot); ExecDropSingleTupleTableSlot(slot); @@ -1771,7 +1820,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 97d44b84622..204b5b614ba 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -481,7 +481,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 756dfa3dcf4..8d804d6bcfe 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -262,7 +262,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -325,18 
+325,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. */ - _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2, IndexBuildResetsSnapshots(indexInfo)); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); result = palloc_object(IndexBuildResult); @@ -484,6 +486,8 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, else reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -539,7 +543,7 @@ _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, co * create an entire btree. 
*/ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) { BTWriteState wstate; @@ -561,18 +565,21 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + InvalidateCatalogSnapshot(); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1411,6 +1418,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,9 +1444,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. 
*/ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1492,6 +1507,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1586,6 +1603,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1602,6 +1621,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 780ef646a54..ff457e3bbfa 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -20,10 +20,12 @@ #include "access/spgist_private.h" #include "access/tableam.h" #include "access/xloginsert.h" +#include "catalog/index.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -143,6 +145,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) result = palloc0_object(IndexBuildResult); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 9407c357f27..7f47e7df07d 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -81,6 +81,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; @@ -1510,8 +1511,8 @@ index_concurrently_build(Oid heapRelationId, Relation indexRelation; IndexInfo *indexInfo; - /* This had better make sure that a snapshot is active */ - Assert(ActiveSnapshotSet()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xid)); /* Open and lock the parent heap relation */ heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock); @@ -1529,19 +1530,28 @@ index_concurrently_build(Oid heapRelationId, indexRelation = index_open(indexRelationId, RowExclusiveLock); + /* BuildIndexInfo may require a snapshot for 
expressions and predicates */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * We have to re-build the IndexInfo struct, since it was lost in the * commit of the transaction where this concurrent index was created at * the catalog level. */ indexInfo = BuildIndexInfo(indexRelation); + /* Done with snapshot */ + PopActiveSnapshot(); Assert(!indexInfo->ii_ReadyForInserts); indexInfo->ii_Concurrent = true; indexInfo->ii_BrokenHotChain = false; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); /* Now build the index */ index_build(heapRel, indexRelation, indexInfo, false, true, true); + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -1552,12 +1562,19 @@ index_concurrently_build(Oid heapRelationId, table_close(heapRel, NoLock); index_close(indexRelation, NoLock); + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates. */ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); + /* we can do away with our snapshot */ + PopActiveSnapshot(); } /* @@ -3257,7 +3274,8 @@ IndexCheckExclusion(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK */ + true, /* syncscan OK */ + false); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3320,12 +3338,16 @@ IndexCheckExclusion(Relation heapRelation, * as of the start of the scan (see table_index_build_scan), whereas a normal * build takes care to include recently-dead tuples. 
This is OK because * we won't mark the index valid until all transactions that might be able - * to see those tuples are gone. The reason for doing that is to avoid + * to see those tuples are gone. One of the reasons for doing that is to avoid * bogus unique-index failures due to concurrent UPDATEs (we might see * different versions of the same row as being valid when we pass over them, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * + * Furthermore, in the case of a non-unique index we set SO_RESET_SNAPSHOT for the + * scan, which causes a new snapshot to be set as active every so often. The reason + * for that is to propagate the xmin horizon forward. + * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all * transactions that could have been modifying the table to terminate. Now diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 9ab74c8df0a..be3dc5e8d28 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1704,23 +1704,17 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We now take a new snapshot, and build the index using all tuples that - * are visible in this snapshot. We can be sure that any HOT updates to + * We build the index using all tuples that are visible in a single snapshot or + * in multiple periodically refreshed snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its * HOT-chain or the extension of the chain is HOT-safe for this index. 
*/ - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* Perform concurrent build of index */ index_concurrently_build(tableId, indexRelationId); - /* we can do away with our snapshot */ - PopActiveSnapshot(); - /* * Commit this transaction to make the indisready update visible. */ @@ -4137,9 +4131,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* * Update progress for the index to build, with the correct parent * table involved. @@ -4154,7 +4145,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Perform concurrent build of new index */ index_concurrently_build(newidx->tableId, newidx->indexId); - PopActiveSnapshot(); CommitTransactionCommand(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4ec76ce31a9..6a086e23a6c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,6 +63,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/snapmgr.h" /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -7047,6 +7048,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) Relation heap; Relation index; RelOptInfo *rel; + bool need_pop_active_snapshot = false; int parallel_workers; BlockNumber heap_blocks; double reltuples; @@ -7102,6 +7104,12 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) heap = table_open(tableOid, NoLock); index = index_open(indexOid, NoLock); + /* Set ActiveSnapshot since functions in the indexes may need it */ + if (!ActiveSnapshotSet()) + { + PushActiveSnapshot(GetTransactionSnapshot()); + need_pop_active_snapshot = true; + } /* * Determine if it's safe to proceed. 
* @@ -7159,6 +7167,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) parallel_workers--; done: + if (need_pop_active_snapshot) + PopActiveSnapshot(); index_close(index, NoLock); table_close(heap, NoLock); diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 83af594d4af..df92b9fee68 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -517,6 +517,16 @@ options => 'compute_query_id_options', }, +{ name => 'concurrent_index_reset_snapshot_every_n_pages', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Sets how often concurrent index builds refresh their snapshot.', + long_desc => 'Zero disables periodic snapshot refresh during the first heap scan of eligible CREATE INDEX CONCURRENTLY and REINDEX CONCURRENTLY builds.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'concurrent_index_reset_snapshot_every_n_pages', + boot_val => '4096', + min => '0', + max => 'INT_MAX', +}, + { name => 'config_file', type => 'string', context => 'PGC_POSTMASTER', group => 'FILE_LOCATIONS', short_desc => 'Sets the server\'s main configuration file.', flags => 'GUC_DISALLOW_IN_FILE | GUC_SUPERUSER_ONLY', diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index c13f05d39db..3560ba40fc2 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -24,6 +24,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/injection_point.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -72,6 +73,18 @@ typedef enum ScanOptions /* collect scan instrumentation */ SO_SCAN_INSTRUMENT = 1 << 11, + /* + * Reset scan and catalog snapshot every so often? If so, the + * concurrent_index_reset_snapshot_every_n_pages GUC decides when the + * active snapshot is popped, the catalog snapshot invalidated, and the + * latest snapshot pushed as active. 
+ * + * At the end of the scan snapshot is not popped. + * Goal of such mode is keep xmin propagating horizon forward. + * + * see heap_reset_scan_snapshot for details. + */ + SO_RESET_SNAPSHOT = 1 << 12, } ScanOptions; /* @@ -85,7 +98,7 @@ typedef enum ScanOptions (SO_TYPE_SEQSCAN | SO_TYPE_BITMAPSCAN | SO_TYPE_SAMPLESCAN | \ SO_TYPE_TIDSCAN | SO_TYPE_TIDRANGESCAN | SO_TYPE_ANALYZE | \ SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | \ - SO_TEMP_SNAPSHOT) + SO_TEMP_SNAPSHOT | SO_RESET_SNAPSHOT) /* * Result codes for table_{update,delete,lock_tuple}, and for visibility @@ -967,7 +980,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, - bool allow_strat, bool allow_sync) + bool allow_strat, bool allow_sync, + bool reset_snapshot) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; @@ -975,6 +989,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, flags |= SO_ALLOW_STRAT; if (allow_sync) flags |= SO_ALLOW_SYNC; + if (reset_snapshot) + { + INJECTION_POINT("table_beginscan_strat_reset_snapshots", NULL); + /* Active snapshot is required on start. */ + Assert(GetActiveSnapshot() == snapshot); + /* Active snapshot should not be registered to keep xmin propagating. */ + Assert(GetActiveSnapshot()->regd_count == 0); + flags |= (SO_RESET_SNAPSHOT); + } return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags, SO_NONE); @@ -1842,6 +1865,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * very hard to detect whether they're really incompatible with the chain tip. * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. + * + * In case of non-unique index and non-parallel concurrent build, + * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. + * That leads to changing snapshots on the fly to allow xmin horizon propagate. 
*/ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 9aee8226347..3d93232361f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -14,6 +14,7 @@ #ifndef INDEX_H #define INDEX_H +#include "access/xact.h" #include "catalog/objectaddress.h" #include "nodes/execnodes.h" @@ -26,6 +27,22 @@ typedef struct AttrMap AttrMap; #define DEFAULT_INDEX_TYPE "btree" +/* + * Does this concurrent index build use periodic snapshot resets? + * + * Snapshot resetting is only applicable when all of: + * - the build is concurrent (ii_Concurrent) + * - the index is non-unique (unique needs consistent snapshot) + * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a + * registered transaction snapshot) + * - the build is not parallel (parallel needs separate infrastructure) + */ +#define IndexBuildResetsSnapshots(indexInfo) \ + ((indexInfo)->ii_Concurrent && \ + !(indexInfo)->ii_Unique && \ + !IsolationUsesXactSnapshot() && \ + !(indexInfo)->ii_ParallelWorkers) + /* Action code for index_set_state_flags */ typedef enum { diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 8ccdf61246b..09cbb4b1d99 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -272,6 +272,7 @@ extern PGDLLIMPORT int work_mem; extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; +extern PGDLLIMPORT int concurrent_index_reset_snapshot_every_n_pages; /* * Upper and lower hard limits for the buffer access strategy ring size diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index f057d143d1a..0ba0e47f3b8 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -9,7 +9,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points 
- facility for injection points" -REGRESS = injection_points hashagg reindex_conc vacuum +REGRESS = injection_points hashagg reindex_conc vacuum cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic \ diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 00000000000..ed4aaaf3463 --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 +1,118 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX 
CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point 
table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ERROR: CREATE INDEX CONCURRENTLY cannot run inside a transaction block +ROLLBACK ; +SET default_transaction_isolation = 
'repeatable read'; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index fb1418e2caa..fe58023f904 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -36,6 +36,7 @@ tests += { 'hashagg', 'reindex_conc', 'vacuum', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 00000000000..553787539b3 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,101 @@ +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + 
LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + 
+CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +BEGIN TRANSACTION; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +ROLLBACK ; + +SET default_transaction_isolation = 'repeatable read'; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +SET default_transaction_isolation = serializable; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +RESET default_transaction_isolation; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; -- 2.43.0 [application/octet-stream] v6-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch (38.8K, 3-v6-0003-Support-snapshot-resets-in-parallel-concurrent-in.patch) download | inline diff: From 2cbb2b26f63671d4fbef6f52757969ce5e5c77a1 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Fri, 3 Apr 2026 19:56:44 +0200 Subject: [PATCH v6 3/4] Support snapshot resets in parallel concurrent index builds Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. 
The main obstacle to applying this technique to parallel builds was the need to wait until worker processes restore their initial snapshot from the leader. To address this, the following changes are made: - add infrastructure to track snapshot restoration in parallel workers - extend parallel scan initialization to support periodic snapshot resets - wait for parallel workers to restore their initial snapshots before proceeding with scan - relax the restriction on parallel workers calling GetLatestSnapshot --- src/backend/access/brin/brin.c | 54 +++++++++++++---- src/backend/access/gin/gininsert.c | 51 ++++++++++++---- src/backend/access/heap/heapam_handler.c | 7 +-- src/backend/access/nbtree/nbtsort.c | 51 ++++++++++++++-- src/backend/access/table/tableam.c | 37 ++++++++++-- src/backend/access/transam/parallel.c | 60 +++++++++++++++++-- src/backend/executor/nodeSeqscan.c | 3 +- src/backend/executor/nodeTidrangescan.c | 3 +- src/backend/utils/time/snapmgr.c | 8 --- src/include/access/parallel.h | 3 +- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 5 +- src/include/catalog/index.h | 4 +- .../expected/cic_reset_snapshots.out | 25 +++++++- .../sql/cic_reset_snapshots.sql | 5 +- 15 files changed, 257 insertions(+), 60 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 7e058f0d7ad..af1b081056f 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1277,6 +1277,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = idxtuples; + InvalidateCatalogSnapshot(); + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); + return result; } @@ -2396,6 +2399,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef 
DISABLE_LEADER_PARTICIPATION @@ -2413,12 +2417,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, scantuplesortstates = leaderparticipates ? request + 1 : request; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot may be reset periodically (in non-xact-snapshot isolation + * modes) to allow the xmin horizon to advance. */ if (!isconcurrent) { @@ -2426,10 +2434,15 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); } /* @@ -2475,7 +2488,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -2501,7 +2514,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromBrinShared(brinshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. 
@@ -2563,6 +2577,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = brinleader; + /* + * In case of concurrent build with snapshot resets, wait until all + * workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _brin_leader_participate_as_worker(buildstate, heap, index); @@ -2571,9 +2592,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -2594,8 +2619,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(brinleader->snapshot)) + if (brinleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(brinleader->snapshot)) UnregisterSnapshot(brinleader->snapshot); DestroyParallelContext(brinleader->pcxt); ExitParallelMode(); @@ -2796,7 +2820,7 @@ _brin_parallel_merge(BrinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * brin index build based on the snapshot its parallel scan will use. + * brin index build. 
*/ static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot) @@ -2845,6 +2869,7 @@ _brin_parallel_scan_and_build(BrinBuildState *state, { SortCoordinate coordinate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -2861,13 +2886,18 @@ _brin_parallel_scan_and_build(BrinBuildState *state, /* Join parallel scan */ indexInfo = BuildIndexInfo(index); indexInfo->ii_Concurrent = brinshared->isconcurrent; + pscan = ParallelTableScanFromBrinShared(brinshared); scan = table_beginscan_parallel(heap, - ParallelTableScanFromBrinShared(brinshared), + pscan, SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, true, brinbuildCallbackParallel, state, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* insert the last item */ form_and_spill_tuple(state); @@ -2890,6 +2920,9 @@ _brin_parallel_scan_and_build(BrinBuildState *state, ConditionVariableSignal(&brinshared->workersdonecv); tuplesort_end(state->bs_sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } /* @@ -2966,7 +2999,6 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort, heapRel, indexRel, sortmem, false); - /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 4df216a268b..6d1c2ce1c2d 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -808,6 +808,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = 
buildstate.indtuples; + Assert(!IndexBuildResetsSnapshots(indexInfo) || !TransactionIdIsValid(MyProc->xmin)); return result; } @@ -956,6 +957,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -972,12 +974,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, scantuplesortstates = leaderparticipates ? request + 1 : request; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot may be reset periodically (in non-xact-snapshot isolation + * modes) to allow the xmin horizon to advance. 
*/ if (!isconcurrent) { @@ -985,10 +991,15 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); } /* @@ -1034,7 +1045,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1059,7 +1070,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromGinBuildShared(ginshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1117,6 +1129,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = ginleader; + /* + * In case of concurrent build with snapshot resets, wait until all + * workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _gin_leader_participate_as_worker(buildstate, heap, index); @@ -1125,9 +1144,12 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1148,8 +1170,7 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(ginleader->snapshot)) + if (ginleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(ginleader->snapshot)) UnregisterSnapshot(ginleader->snapshot); DestroyParallelContext(ginleader->pcxt); ExitParallelMode(); @@ -1825,7 +1846,7 @@ _gin_parallel_merge(GinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * gin index build based on the snapshot its parallel scan will use. + * gin index build. 
*/ static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot) @@ -2057,6 +2078,7 @@ _gin_parallel_scan_and_build(GinBuildState *state, { SortCoordinate coordinate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -2087,13 +2109,18 @@ _gin_parallel_scan_and_build(GinBuildState *state, /* Join parallel scan */ indexInfo = BuildIndexInfo(index); indexInfo->ii_Concurrent = ginshared->isconcurrent; + pscan = ParallelTableScanFromGinBuildShared(ginshared); scan = table_beginscan_parallel(heap, - ParallelTableScanFromGinBuildShared(ginshared), + pscan, SO_NONE); reltuples = table_index_build_scan(heap, index, indexInfo, true, progress, ginBuildCallbackParallel, state, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* write remaining accumulated entries */ ginFlushBuildState(state, index); @@ -2123,6 +2150,9 @@ _gin_parallel_scan_and_build(GinBuildState *state, ConditionVariableSignal(&ginshared->workersdonecv); tuplesort_end(state->bs_sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } /* @@ -2218,7 +2248,6 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort, heapRel, indexRel, sortmem, false); - /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index faf3e04c449..1184ed086fe 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1213,7 +1213,8 @@ heapam_index_build_range_scan(Relation heapRelation, * SnapshotAny because 
we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. + * and index whatever's live according to that while that snapshot is reset + * every so often (in the case of non-unique index). */ OldestXmin = InvalidTransactionId; @@ -1221,8 +1222,6 @@ heapam_index_build_range_scan(Relation heapRelation, * For unique indexes we need a consistent snapshot for the whole scan. * Resetting snapshots also doesn't work in xact-snapshot isolation modes, * because those keep a registered transaction snapshot for the whole xact. - * In the case of parallel scan, some additional infrastructure is required - * to perform a scan with SO_RESET_SNAPSHOT which is not yet ready. */ reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && !is_system_catalog; /* just for the case */ @@ -1284,7 +1283,7 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; - if (IsMVCCSnapshot(snapshot)) + if (!reset_snapshots && IsMVCCSnapshot(snapshot)) { /* Don't expose SnapshotAny to SQL run by predicates/expressions. */ PushActiveSnapshot(snapshot); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 8d804d6bcfe..6bb365c951d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1419,6 +1419,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1436,12 +1437,24 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; + /* + * For concurrent non-unique index builds, we can periodically reset + * snapshots to allow the xmin horizon to advance. This is safe since + * these builds don't require a consistent view across the entire scan. + * Unique indexes still need a stable snapshot to properly enforce + * uniqueness constraints. Isolation modes where + * IsolationUsesXactSnapshot() is true also prevent resetting because + * they keep a registered transaction snapshot for the whole transaction. + */ + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot() && !btspool->isunique; + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * live according to that, while that snapshot may be reset periodically + * for non-unique indexes in non-xact-snapshot isolation modes. 
*/ if (!isconcurrent) { @@ -1449,6 +1462,11 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); @@ -1509,7 +1527,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1536,7 +1554,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->brokenhotchain = false; table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1612,6 +1631,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* Save leader state now that it's clear build will be parallel */ buildstate->btleader = btleader; + /* + * In the case of concurrent build, snapshots are going to be reset periodically. + * Wait until all workers imported initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _bt_leader_participate_as_worker(buildstate); @@ -1620,9 +1646,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. 
*/ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); + + InvalidateCatalogSnapshot(); + Assert(!reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); } /* @@ -1644,7 +1674,7 @@ _bt_end_parallel(BTLeader *btleader) InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(btleader->snapshot)) + if (btleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(btleader->snapshot)) UnregisterSnapshot(btleader->snapshot); DestroyParallelContext(btleader->pcxt); ExitParallelMode(); @@ -1894,6 +1924,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, SortCoordinate coordinate; BTBuildState buildstate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -1948,12 +1979,17 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, /* Join parallel scan */ indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; + pscan = ParallelTableScanFromBTShared(btshared); scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared), + pscan, SO_NONE); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* Execute this worker's part of the sort */ if (progress) @@ -1989,4 +2025,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, tuplesort_end(btspool->sortstate); if (btspool2) tuplesort_end(btspool2->sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } diff --git 
a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 68ff0966f1c..cf337eda5f6 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -132,10 +132,10 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) { Size sz = 0; - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) sz = add_size(sz, EstimateSnapshotSpace(snapshot)); else - Assert(snapshot == SnapshotAny); + Assert(snapshot == SnapshotAny || snapshot == InvalidSnapshot); sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); @@ -144,21 +144,36 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot) + Snapshot snapshot, bool reset_snapshot) { Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); pscan->phs_snapshot_off = snapshot_off; - if (IsMVCCSnapshot(snapshot)) + /* + * Initialize parallel scan description. For normal scans with a regular + * MVCC snapshot, serialize the snapshot info. For scans that use periodic + * snapshot resets, mark the scan accordingly. 
+ */ + if (reset_snapshot) + { + Assert(snapshot == InvalidSnapshot); + pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = true; + INJECTION_POINT("table_parallelscan_initialize", NULL); + } + else if (IsMVCCSnapshot(snapshot)) { SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = false; } else { Assert(snapshot == SnapshotAny); + Assert(!reset_snapshot); pscan->phs_snapshot_any = true; + pscan->phs_reset_snapshot = false; } } @@ -172,7 +187,19 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan, Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); - if (!pscan->phs_snapshot_any) + /* + * For scans that + * use periodic snapshot resets, mark the scan accordingly and use the active + * snapshot as the initial state. + */ + if (pscan->phs_reset_snapshot) + { + Assert(ActiveSnapshotSet()); + internal_flags |= SO_RESET_SNAPSHOT; + /* Start with current active snapshot. */ + snapshot = GetActiveSnapshot(); + } + else if (!pscan->phs_snapshot_any) { /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 89e9d224eec..cff50909ea3 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -79,6 +79,7 @@ #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) #define PARALLEL_KEY_CLIENTCONNINFO UINT64CONST(0xFFFFFFFFFFFF000F) +#define PARALLEL_KEY_SNAPSHOT_RESTORED UINT64CONST(0xFFFFFFFFFFFF0010) /* Fixed-size parallel state. 
*/ typedef struct FixedParallelState @@ -308,6 +309,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(bool), + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + /* Estimate how much we'll need for the entrypoint info. */ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + strlen(pcxt->function_name) + 2); @@ -379,6 +384,7 @@ InitializeParallelDSM(ParallelContext *pcxt) char *entrypointstate; char *uncommittedenumsspace; char *clientconninfospace; + bool *snapshot_set_flag_space; Size lnamelen; /* Serialize shared libraries we have loaded. */ @@ -494,6 +500,19 @@ InitializeParallelDSM(ParallelContext *pcxt) strcpy(entrypointstate, pcxt->library_name); strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + + /* + * Allocate one flag per worker in dynamic shared memory, used to report + * that the worker has restored its snapshot. + */ + snapshot_set_flag_space = + shm_toc_allocate(pcxt->toc, mul_size(sizeof(bool), pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + pcxt->worker[i].snapshot_restored = &snapshot_set_flag_space[i]; + *pcxt->worker[i].snapshot_restored = false; + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, snapshot_set_flag_space); } /* Update nworkers_to_launch, in case we changed nworkers above. */ @@ -670,6 +689,10 @@ LaunchParallelWorkers(ParallelContext *pcxt) * Wait for all workers to attach to their error queues, and throw an error if * any worker fails to do this. * + * wait_for_snapshot: if true, additionally wait until each parallel worker has restored + * its snapshot. This is needed when using periodic snapshot resets to ensure all + * workers have a valid initial snapshot before proceeding with the scan.
+ * * Callers can assume that if this function returns successfully, then the * number of workers given by pcxt->nworkers_launched have initialized and * attached to their error queues. Whether or not these workers are guaranteed @@ -699,7 +722,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) * call this function at all. */ void -WaitForParallelWorkersToAttach(ParallelContext *pcxt) +WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot) { int i; @@ -743,9 +766,23 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); if (shm_mq_get_sender(mq) != NULL) { - /* Yes, so it is known to be attached. */ - pcxt->known_attached_workers[i] = true; - ++pcxt->nknown_attached_workers; + if (!wait_for_snapshot || *pcxt->worker[i].snapshot_restored) + { + /* Worker is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + else + { + /* + * Worker attached but hasn't restored its snapshot yet. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10, WAIT_EVENT_BGWORKER_STARTUP); + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } } } else if (status == BGWH_STOPPED) @@ -788,6 +825,16 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) break; } } + + /* Set snapshot restored flag to false. */ + if (pcxt->nworkers > 0) + { + bool *snapshot_restored_space; + snapshot_restored_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + for (i = 0; i < pcxt->nworkers; ++i) + snapshot_restored_space[i] = false; + } } /* @@ -1304,6 +1351,7 @@ ParallelWorkerMain(Datum main_arg) shm_toc *toc; FixedParallelState *fps; char *error_queue_space; + bool *snapshot_restored_space; shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; @@ -1507,6 +1555,10 @@ ParallelWorkerMain(Datum main_arg) fps->parallel_leader_pgproc); PushActiveSnapshot(asnapshot); + /* Snapshot is restored, set flag to make leader know about it. 
*/ + snapshot_restored_space = shm_toc_lookup(toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + snapshot_restored_space[ParallelWorkerNumber] = true; + /* * We've changed which tuples we can see, and must therefore invalidate * system caches. diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 5bcb0a861d7..122d7458561 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -404,7 +404,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index b387ed6c308..07e575164a1 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -488,7 +488,8 @@ ExecTidRangeScanInitializeDSM(TidRangeScanState *node, ParallelContext *pcxt) pscan = shm_toc_allocate(pcxt->toc, node->trss_pscanlen); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel_tidrange(node->ss.ss_currentRelation, diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 10fe18df2e7..f22da271b9e 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -353,14 +353,6 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { - /* - * We might be able to relax this, but nothing that could otherwise work - * needs it. 
- */ - if (IsInParallelMode()) - elog(ERROR, - "cannot update SecondarySnapshot during a parallel operation"); - /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if required. diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 60f857675e0..9a3e14e0c0e 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -28,6 +28,7 @@ typedef struct ParallelWorkerInfo { BackgroundWorkerHandle *bgwhandle; shm_mq_handle *error_mqh; + bool *snapshot_restored; } ParallelWorkerInfo; typedef struct ParallelContext @@ -67,7 +68,7 @@ extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); extern void LaunchParallelWorkers(ParallelContext *pcxt); -extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot); extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); extern void DestroyParallelContext(ParallelContext *pcxt); extern bool ParallelContextActive(void); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 2ea06a67a63..8d54fdc169e 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -87,6 +87,7 @@ typedef struct ParallelTableScanDescData RelFileLocator phs_locator; /* physical relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + bool phs_reset_snapshot; /* use SO_RESET_SNAPSHOT? 
*/ Size phs_snapshot_off; /* data for snapshot */ } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 3560ba40fc2..219525dbcb0 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1212,7 +1212,8 @@ extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); */ extern void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot); + Snapshot snapshot, + bool reset_snapshot); /* * Begin a parallel scan. `pscan` needs to have been initialized with @@ -1866,7 +1867,7 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique index and non-parallel concurrent build, + * In case of non-unique concurrent build, * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. * That leads to changing snapshots on the fly to allow xmin horizon propagate. 
*/ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 3d93232361f..60f8356ed82 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -35,13 +35,11 @@ typedef struct AttrMap AttrMap; * - the index is non-unique (unique needs consistent snapshot) * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a * registered transaction snapshot) - * - the build is not parallel (parallel needs separate infrastructure) */ #define IndexBuildResetsSnapshots(indexInfo) \ ((indexInfo)->ii_Concurrent && \ !(indexInfo)->ii_Unique && \ - !IsolationUsesXactSnapshot() && \ - !(indexInfo)->ii_ParallelWorkers) + !IsolationUsesXactSnapshot()) /* Action code for index_set_state_flags */ typedef enum diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index ed4aaaf3463..e5a8a7f3a79 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -17,6 +17,12 @@ SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice' (1 row) +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -72,30 +78,45 @@ NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + injection_points_detach +------------------------- + +(1 row) + CREATE UNIQUE INDEX CONCURRENTLY idx ON 
cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); 
+NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; BEGIN TRANSACTION; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 553787539b3..a85b0cbf575 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -3,7 +3,7 @@ CREATE EXTENSION injection_points; SELECT injection_points_set_local(); SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); - +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); @@ -53,6 +53,9 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- 2.43.0 [application/octet-stream] v6-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch (41.9K, 4-v6-0004-Support-snapshot-resets-in-concurrent-builds-of-u.patch) download | inline diff: From a91a6dfdb918966a87c641a8e2ec0f13e8adeb0b Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 11 Apr 2026 18:45:47 +0200 Subject: [PATCH v6 4/4] Support snapshot resets in concurrent builds of unique indexes Previously, concurrent builds of unique index 
used a fixed snapshot for the entire scan to ensure proper uniqueness checks. Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: - ignoring tuples that are dead according to SnapshotSelf during uniqueness checks in tuplesort, as a fail-fast mechanism rather than a guarantee - adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values as a guarantee of correctness Tuples are tested against SnapshotSelf only when their index key values are equal; otherwise _bt_load works as before. --- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 7 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtreadpage.c | 2 +- src/backend/access/nbtree/nbtsort.c | 158 ++++++++++++++---- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 59 ++++++- src/backend/catalog/index.c | 6 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/misc/guc_parameters.dat | 10 ++ src/backend/utils/sort/tuplesort.c | 2 + src/backend/utils/sort/tuplesortvariants.c | 101 +++++++++-- src/include/access/nbtree.h | 6 +- src/include/access/tableam.h | 7 +- src/include/catalog/index.h | 2 - src/include/miscadmin.h | 1 + src/include/utils/tuplesort.h | 3 + .../expected/cic_reset_snapshots.out | 6 + 18 files changed, 320 insertions(+), 86 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375a..829dad1194e 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. Any tuples visible in the -snapshot will have only valid forward-growing HOT chains.
(They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. +for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) +As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 1184ed086fe..68f6e6731df 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1214,13 +1214,14 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in the case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique indexes we need a consistent snapshot for the whole scan. - * Resetting snapshots also doesn't work in xact-snapshot isolation modes, + * For concurrent builds of non-system indexes, we want to periodically + * reset snapshots. + * Resetting snapshots doesn't work in xact-snapshot isolation modes, * because those keep a registered transaction snapshot for the whole xact. 
*/ reset_snapshots = IndexBuildResetsSnapshots(indexInfo) && diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index af7affdf409..10f4f7eeba9 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -149,7 +149,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -376,7 +376,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c index 2ba1ca66023..c4c278e777a 100644 --- a/src/backend/access/nbtree/nbtreadpage.c +++ b/src/backend/access/nbtree/nbtreadpage.c @@ -623,7 +623,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's 
page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6bb365c951d..3982e5e8a1d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -87,6 +87,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -105,6 +106,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -207,15 +209,14 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and built + * non-concurrently. Dead tuples are put into spool2 instead of spool + * in order to avoid uniqueness check. 
*/ BTSpool *spool2; double indtuples; @@ -262,7 +263,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -307,8 +308,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -385,6 +384,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks only when concurrent + * unique builds actually use periodic snapshot resets. + */ + btspool->unique_dead_ignored = IndexBuildResetsSnapshots(indexInfo) && + indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -433,8 +438,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor. 
*/ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -442,8 +448,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. + * + * In case of concurrent build dead tuples do not need to be put into index + * since we wait for all snapshots older than reference snapshot during the + * validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !IndexBuildResetsSnapshots(indexInfo)) { BTSpool *btspool2 = palloc0_object(BTSpool); SortCoordinate coordinate2 = NULL; @@ -474,7 +484,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -1155,14 +1165,100 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; - bool deduplicate; + bool deduplicate, + fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * The unique_dead_ignored does not guarantee absence of multiple alive + * tuples with the same values in the spool. Such a case may happen if + * alive tuples are located between a few dead tuples, like this: addda. 
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap, SO_NONE); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* if this is not the first tuple */ + { + bool has_nulls = false, + call_again = false, + ignored = false, + now_alive; + ItemPointerData tid; + + /* is this tuple equal to previous one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* check previous tuple if not yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group? 
*/ + if (seen_alive && now_alive) + _bt_report_duplicate(wstate->index, wstate->heap, itup); + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + if (prev) pfree(prev); + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1322,7 +1418,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1438,15 +1534,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; /* - * For concurrent non-unique index builds, we can periodically reset - * snapshots to allow the xmin horizon to advance. This is safe since - * these builds don't require a consistent view across the entire scan. - * Unique indexes still need a stable snapshot to properly enforce - * uniqueness constraints. Isolation modes where + * For concurrent index builds, we can periodically reset snapshots to + * allow the xmin horizon to advance. + * Isolation modes where * IsolationUsesXactSnapshot() is true also prevent resetting because * they keep a registered transaction snapshot for the whole transaction. */ - reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot() && !btspool->isunique; + reset_snapshot = isconcurrent && !IsolationUsesXactSnapshot(); /* * Prepare for scan of the base relation. 
In a normal index build, we use @@ -1454,7 +1548,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's * live according to that, while that snapshot may be reset periodically - * for non-unique indexes in non-xact-snapshot isolation modes. + * in non-xact-snapshot isolation modes. */ if (!isconcurrent) { @@ -1483,10 +1577,10 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_estimate_chunk(&pcxt->estimator, estsort); /* - * Unique case requires a second spool, and so we may have to account for - * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 + * Non-concurrent unique case requires a second spool, and so we may have + * to account for another shared workspace -- PARALLEL_KEY_TUPLESORT_SPOOL2 */ - if (!btspool->isunique) + if (!btspool->isunique || reset_snapshot) shm_toc_estimate_keys(&pcxt->estimator, 2); else { @@ -1541,6 +1635,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1568,8 +1663,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); - /* Unique case requires a second spool, and associated shared state */ - if (!btspool->isunique) + /* Non-concurrent unique case requires a second spool and shared state */ + if (!btspool->isunique || reset_snapshot) sharedsort2 = NULL; else { @@ -1752,9 
+1847,10 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ - if (!btleader->btshared->isunique) + if (!btleader->btshared->isunique || (btleader->btshared->isconcurrent && !IsolationUsesXactSnapshot())) leaderworker2 = NULL; else { @@ -1855,11 +1951,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || (btshared->isconcurrent && !IsolationUsesXactSnapshot())) { btspool2 = NULL; sharedsort2 = NULL; @@ -1939,6 +2036,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1961,14 +2059,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git 
a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index de9eca3c8b2..b174fb8d3ee 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -688,7 +688,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -719,7 +719,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -968,7 +968,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -989,7 +989,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1028,7 +1028,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1150,7 +1150,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 014faa1622f..e9351e54d7f 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -17,6 +17,7 @@ #include <time.h> +#include "access/genam.h" #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" @@ -32,9 +33,6 @@ static int _bt_compare_int(const void *va, const void *vb); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); - /* * _bt_mkscankey @@ -707,7 +705,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -825,17 +823,24 @@ _bt_truncate(Relation rel, IndexTuple 
lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. * + * This is exported for use as a comparison function during a concurrent + * unique index build, when _bt_keep_natts_fast is not suitable because the + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * If hasnulls is given, *hasnulls becomes true when a compared column is NULL. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -861,6 +866,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; if (isNull1 != isNull2) break; @@ -880,7 +887,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -891,7 +898,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. 
+ * save cycles. It may also be used as a comparison function during a + * concurrent build of a unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -900,6 +908,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * If hasnulls is given, *hasnulls becomes true when a compared column is NULL. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -908,7 +918,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -925,6 +936,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= isNull1 | isNull2; att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) @@ -1102,6 +1115,34 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) return tupnatts > 0 && tupnatts <= nkeyatts; } +/* + * _bt_report_duplicate() -- report a unique violation during index build. + * + * This is used by _bt_load() and by the tuplesort comparator to report + * duplicate keys found while building a unique index, including CREATE INDEX + * CONCURRENTLY and REINDEX CONCURRENTLY with snapshot resets enabled. 
+ */ +void +_bt_report_duplicate(Relation indexRel, Relation heapRel, IndexTuple itup) +{ + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + index_deform_tuple(itup, RelationGetDescr(indexRel), values, isnull); + + key_desc = BuildIndexValueDescription(indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(heapRel, + RelationGetRelationName(indexRel)))); +} + /* * * _bt_check_third_page() -- check whether tuple fits on a btree page at all. diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 7f47e7df07d..1383faeacfc 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3344,9 +3344,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes a new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. 
Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index be3dc5e8d28..9fff8635d3d 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1704,8 +1704,8 @@ DefineIndex(ParseState *pstate, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index df92b9fee68..fb960384c18 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -1356,6 +1356,16 @@ show_hook => 'show_in_hot_standby', }, +{ name => 'index_build_duplicate_check_max_fetch_pct', type => 'int', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Limits heap fetches during fail-fast duplicate checking in concurrent unique index builds.', + long_desc => 'Maximum percentage of total tuplesort tuples that may trigger heap fetches in the sort comparator. Zero disables the fail-fast check. 
100 allows unlimited fetches.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'index_build_duplicate_check_max_fetch_pct', + boot_val => '1', + min => '0', + max => '100', +}, + { name => 'integer_datetimes', type => 'bool', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows whether datetimes are integer based.', flags => 'GUC_REPORT | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 72c2c2995d8..b9caa001fdc 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -688,6 +688,7 @@ tuplesort_begin_batch(Tuplesortstate *state) state->tapeset = NULL; state->memtupcount = 0; + state->base.ntuples = 0; state->growmemtuples = true; state->slabAllocatorUsed = false; @@ -1073,6 +1074,7 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, /* account for the memory used for this tuple */ USEMEM(state, tuplen); state->tupleMem += tuplen; + state->base.ntuples++; if (!useAbbrev) { diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 2509ac3e3a4..2ec0c71918b 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -25,9 +25,12 @@ #include "access/hash.h" #include "access/htup_details.h" #include "access/nbtree.h" +#include "access/relscan.h" +#include "access/tableam.h" #include "catalog/index.h" #include "catalog/pg_collation.h" #include "executor/executor.h" +#include "miscadmin.h" #include "pg_trace.h" #include "utils/builtins.h" #include "utils/datum.h" @@ -35,7 +38,9 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/tuplesort.h" +#include "storage/proc.h" +int index_build_duplicate_check_max_fetch_pct = 1; /* sort-type codes for sort__start probes */ #define HEAP_SORT 0 @@ -136,6 +141,9 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool 
uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ + int64 heapFetchCount; /* heap fetches performed in comparator */ + int64 heapFetchLimit; /* max allowed, -1 = not yet computed */ } TuplesortIndexBTreeArg; /* @@ -361,6 +369,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -403,6 +412,9 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; + arg->heapFetchCount = 0; + arg->heapFetchLimit = -1; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1667,9 +1679,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ if (arg->enforceUnique && !(!arg->uniqueNullsNotDistinct && equal_hasnull)) { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1679,18 +1689,79 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* + * Fail-fast check: perform heap fetches to see if either tuple is + * dead, allowing us to skip the uniqueness error. See _bt_load for + * the definitive check. 
+ * + * The number of heap fetches is bounded by + * index_build_duplicate_check_max_fetch_pct to avoid excessive I/O + * when many equal keys are compared during sorting. Once the budget + * is exhausted, we skip the fail-fast check and let _bt_load() + * handle uniqueness verification. + */ + if (arg->uniqueDeadIgnored) + { + /* Lazily compute the heap fetch limit */ + if (arg->heapFetchLimit < 0) + { + if (index_build_duplicate_check_max_fetch_pct > 0) + arg->heapFetchLimit = base->ntuples * + index_build_duplicate_check_max_fetch_pct / 100; + else + arg->heapFetchLimit = 0; + } + + if (arg->heapFetchLimit > 0 && + arg->heapFetchCount < arg->heapFetchLimit) + { + bool any_tuple_dead, + call_again = false, + ignored; + /* + * Keep the slot/fetch lifetime local to this duplicate check + * so the xmin propagation assertion below continues to + * validate cleanup. + */ + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel, SO_NONE); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + arg->heapFetchCount++; + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, + &ignored); + arg->heapFetchCount++; + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + else + { + /* Budget exhausted; defer to _bt_load() */ + uniqueCheckFail = false; + 
} + } + if (uniqueCheckFail) + _bt_report_duplicate(arg->index.indexRel, arg->index.heapRel, tuple1); } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3097e9bb1af..bd789b7d344 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1307,13 +1307,17 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); extern bool _bt_allequalimage(Relation rel, bool debugmessage); +extern void _bt_report_duplicate(Relation indexRel, Relation heapRel, + IndexTuple itup); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 219525dbcb0..d1a298bf1bb 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1867,9 +1867,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent build, - * concurrent_index_reset_snapshot_every_n_pages is applied for the scan. - * That leads to changing snapshots on the fly to allow xmin horizon propagate. + * In case of concurrent index build, + * concurrent_index_reset_snapshot_every_n_pages is applied for the + * scan. That leads to changing snapshots on the fly to allow xmin + * horizon propagate. 
*/ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 60f8356ed82..ed000bf10d5 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -32,13 +32,11 @@ typedef struct AttrMap AttrMap; * * Snapshot resetting is only applicable when all of: * - the build is concurrent (ii_Concurrent) - * - the index is non-unique (unique needs consistent snapshot) * - isolation level is not REPEATABLE READ/SERIALIZABLE (those keep a * registered transaction snapshot) */ #define IndexBuildResetsSnapshots(indexInfo) \ ((indexInfo)->ii_Concurrent && \ - !(indexInfo)->ii_Unique && \ !IsolationUsesXactSnapshot()) /* Action code for index_set_state_flags */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 09cbb4b1d99..8954b7818f0 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -273,6 +273,7 @@ extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; extern PGDLLIMPORT int concurrent_index_reset_snapshot_every_n_pages; +extern PGDLLIMPORT int index_build_duplicate_check_max_fetch_pct; /* * Upper and lower hard limits for the buffer access strategy ring size diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index da68f45acf2..191010c57c3 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -216,6 +216,8 @@ typedef struct bool tuples; /* Can SortTuple.tuple ever be set? 
*/ + int64 ntuples; /* total tuples inserted into sort */ + void *arg; /* Specific information for the sort variant */ } TuplesortPublic; @@ -396,6 +398,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index e5a8a7f3a79..7835a796b88 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -41,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); (1 row) CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection 
point table_parallelscan_initialize -- 2.43.0 [application/octet-stream] v6-0001-Add-stress-tests-for-concurrent-index-builds.patch (12.6K, 5-v6-0001-Add-stress-tests-for-concurrent-index-builds.patch) download | inline diff: From 93f0159013569ade893117635de98801635e3233 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu <[email protected]> Date: Sat, 30 Nov 2024 16:24:20 +0100 Subject: [PATCH v6 1/4] Add stress tests for concurrent index builds Introduce stress tests for concurrent index operations: - test concurrent inserts/updates during CREATE/REINDEX INDEX CONCURRENTLY - cover various index types (btree, gin, gist, brin, hash, spgist) - test unique and non-unique indexes - test with expressions and predicates - test both parallel and non-parallel operations - test both read-committed and repeatable-read isolation levels These tests verify the behavior of the following commits. --- src/bin/pg_amcheck/meson.build | 1 + src/bin/pg_amcheck/t/006_cic.pl | 293 ++++++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+) create mode 100644 src/bin/pg_amcheck/t/006_cic.pl diff --git a/src/bin/pg_amcheck/meson.build b/src/bin/pg_amcheck/meson.build index 592cef74ecb..51a62dccb7b 100644 --- a/src/bin/pg_amcheck/meson.build +++ b/src/bin/pg_amcheck/meson.build @@ -28,6 +28,7 @@ tests += { 't/003_check.pl', 't/004_verify_heapam.pl', 't/005_opclass_damage.pl', + 't/006_cic.pl', ], }, } diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl new file mode 100644 index 00000000000..dd7a1eff0ef --- /dev/null +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -0,0 +1,293 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test REINDEX CONCURRENTLY with concurrent modifications and HOT updates +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use constant STRESS_PGBENCH_CLIENTS => 30; +use constant STRESS_PGBENCH_JOBS => 8; +use constant STRESS_PGBENCH_TRANSACTIONS => 
10000; +use constant STRESS_MAX_SLEEP_MS => 10; + +use constant DEFAULT_PGBENCH_CLIENTS => 15; +use constant DEFAULT_PGBENCH_JOBS => 4; +use constant DEFAULT_PGBENCH_TRANSACTIONS => 500; +use constant DEFAULT_MAX_SLEEP_MS => 1; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my $node; +my $pg_test_extra = $ENV{PG_TEST_EXTRA} // ''; +my $is_stress = $pg_test_extra =~ /\bstress\b/ ? 1 : 0; +my $pgbench_clients = + $is_stress ? STRESS_PGBENCH_CLIENTS : DEFAULT_PGBENCH_CLIENTS; +my $pgbench_jobs = $is_stress ? STRESS_PGBENCH_JOBS : DEFAULT_PGBENCH_JOBS; +my $pgbench_transactions = + $is_stress ? STRESS_PGBENCH_TRANSACTIONS : DEFAULT_PGBENCH_TRANSACTIONS; +my $max_sleep_ms = $is_stress ? STRESS_MAX_SLEEP_MS : DEFAULT_MAX_SLEEP_MS; +my $pgbench_options = sprintf( + '--no-vacuum --client=%d --jobs=%d --exit-on-abort --transactions=%d', + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions); +my $no_hot = $is_stress ? int(rand(2)) : 0; + +print( + sprintf( + 'settings: PG_TEST_EXTRA=%s stress=%d clients=%d jobs=%d transactions=%d max_sleep_ms=%d no_hot=%d', + defined($ENV{PG_TEST_EXTRA}) + ? ($pg_test_extra eq '' ? '(empty)' : $pg_test_extra) + : '(undef)', + $is_stress, + $pgbench_clients, + $pgbench_jobs, + $pgbench_transactions, + $max_sleep_ms, + $no_hot)); +print "\n"; + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('RC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . 
(1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->append_conf('postgresql.conf', 'fsync = off'); +$node->append_conf('postgresql.conf', 'maintenance_work_mem = 32MB'); # to avoid OOM +$node->append_conf('postgresql.conf', 'shared_buffers = 32MB'); # to avoid OOM +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE UNLOGGED TABLE tbl(i int primary key, + c1 money default 0, c2 money default 0, + c3 money default 0, updated_at timestamp, + ia int4[], p point))); + +if ($no_hot) { $node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i, updated_at);)); } + +# create sequence +$node->safe_psql('postgres', q(CREATE UNLOGGED SEQUENCE in_row_rebuild START 1 INCREMENT 1;)); +$node->safe_psql('postgres', q(SELECT nextval('in_row_rebuild');)); + +# Create helper functions for predicate tests +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_stable() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN true; + END; $$; +)); + +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_const(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + RETURN MOD($1, 2) = 0; + END; $$; +)); + +# Run CIC/RIC in different options concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY', + { + 'concurrent_ops' => sprintf(q( + SET debug_parallel_query = off; -- this is because predicate_stable implementation + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set variant random(0, 5) + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + \if :variant = 0 + CREATE INDEX 
CONCURRENTLY new_idx ON tbl(i, updated_at); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_stable(); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE MOD(i, 2) = 0; + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_const(i); + \elif :variant = 4 + CREATE INDEX CONCURRENTLY new_idx ON tbl(predicate_const(i)); + \elif :variant = 5 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, predicate_const(i), updated_at) WHERE predicate_const(i); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1000, 100000) + BEGIN; + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + INSERT INTO tbl 
VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + COMMIT; + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for unique index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for unique BTREE', + { + 'concurrent_ops_unique_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + CREATE UNIQUE INDEX CONCURRENTLY new_idx ON tbl(i); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIN with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually 
processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN', + { + 'concurrent_ops_gin_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIN (ia); + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + SELECT gin_index_check('new_idx'); + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIST/BRIN/HASH/SPGIST index concurrently with upserts +$node->pgbench( + $pgbench_options, + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_other_idx' => sprintf(q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \set use_rr random(0, 9) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :use_rr = 0 + SET default_transaction_isolation = 'repeatable read'; + \endif + \set variant random(0, 3) + \if :variant 
= 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIST (p); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING BRIN (updated_at); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING HASH (updated_at); + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING SPGIST (p); + \endif + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + REINDEX INDEX CONCURRENTLY new_idx; + \set sleep_ms random(0, %d) + \sleep :sleep_ms ms + DROP INDEX CONCURRENTLY new_idx; + RESET default_transaction_isolation; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now(),ARRAY[floor(random()*100)::int],point(random(),random())) + ON CONFLICT(i) DO UPDATE SET updated_at = now(), ia = ARRAY[floor(random()*100)::int], p = point(random(),random()); + SELECT setval('in_row_rebuild', 1); + \endif + ), $max_sleep_ms, $max_sleep_ms) + }); + +$node->stop; +done_testing(); -- 2.43.0 ^ permalink raw reply [nested|flat] 6+ messages in thread
end of thread, other threads: [~2026-04-18 13:33 UTC | newest]
Thread overview: 6+ messages (download: mbox, mbox.gz; follow: Atom feed)
-- links below jump to the message on this page --
2026-03-09 00:03 Re: Resetting snapshots during the first phase of [CREATE |RE]INDEX CONCURRENTLY Mihail Nikalayeu <[email protected]>
2026-03-20 01:15 ` Mihail Nikalayeu <[email protected]>
2026-03-21 23:50 ` Mihail Nikalayeu <[email protected]>
2026-04-06 17:55 ` Mihail Nikalayeu <[email protected]>
2026-04-11 17:48 ` Mihail Nikalayeu <[email protected]>
2026-04-18 13:33 ` Mihail Nikalayeu <[email protected]>
This inbox is served by agora; see mirroring instructions for how to clone and mirror all data and code used for this inbox