From dc8447015383a3c38c71570749b697b25c7aceb7 Mon Sep 17 00:00:00 2001 From: nkey Date: Sat, 7 Dec 2024 23:27:34 +0100 Subject: [PATCH v6 6/6] Allow snapshot resets in concurrent unique index builds Previously, concurrent unique index builds used a fixed snapshot for the entire scan to ensure proper uniqueness checks. This could delay vacuum's ability to clean up dead tuples. Now reset snapshots periodically during concurrent unique index builds, while still maintaining uniqueness by: 1. Ignoring dead tuples during uniqueness checks in tuplesort 2. Adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values This improves vacuum effectiveness during long-running index builds without compromising index uniqueness enforcement. --- src/backend/access/heap/heapam_handler.c | 6 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtsort.c | 173 ++++++++++++++---- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 29 ++- src/backend/catalog/index.c | 6 +- src/backend/utils/sort/tuplesortvariants.c | 67 +++++-- src/include/access/nbtree.h | 4 +- src/include/access/tableam.h | 5 +- src/include/utils/tuplesort.h | 1 + .../expected/cic_reset_snapshots.out | 6 + 11 files changed, 242 insertions(+), 75 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 2e5163609c1..921b806642a 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1232,15 +1232,15 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in case of non-unique index). + * every so often. 
*/ OldestXmin = InvalidTransactionId; /* - * For unique index we need consistent snapshot for the whole scan. + * For concurrent builds of non-system indexes, we may want to periodically + * reset snapshots to allow vacuum to clean up tuples. */ reset_snapshots = indexInfo->ii_Concurrent && - !indexInfo->ii_Unique && !is_system_catalog; /* just for the case */ /* okay to ignore lazy VACUUMs here */ diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index 456d86b51c9..31b59265a29 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -148,7 +148,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, _bt_dedup_start_pending(state, itup, offnum); } else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* @@ -374,7 +374,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, /* itup starts first pending interval */ _bt_dedup_start_pending(state, itup, offnum); } - else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts && _bt_dedup_save_htid(state, itup)) { /* Tuple is equal; just added its TIDs to pending interval */ @@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state, itemid = PageGetItemId(page, minoff); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) { itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); itup = (IndexTuple) PageGetItem(page, itemid); - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts) return true; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c 
index 2acbf121745..ac9e5acfc53 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -83,6 +83,7 @@ typedef struct BTSpool Relation index; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; } BTSpool; /* @@ -101,6 +102,7 @@ typedef struct BTShared Oid indexrelid; bool isunique; bool nulls_not_distinct; + bool unique_dead_ignored; bool isconcurrent; int scantuplesortstates; @@ -203,15 +205,13 @@ typedef struct BTLeader */ typedef struct BTBuildState { - bool isunique; - bool nulls_not_distinct; bool havedead; Relation heap; BTSpool *spool; /* - * spool2 is needed only when the index is a unique index. Dead tuples are - * put into spool2 instead of spool in order to avoid uniqueness check. + * spool2 is needed only when the index is a unique index and is built non-concurrently. + * Dead tuples are put into spool2 instead of spool in order to avoid uniqueness check. */ BTSpool *spool2; double indtuples; @@ -303,8 +303,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - buildstate.isunique = indexInfo->ii_Unique; - buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; buildstate.havedead = false; buildstate.heap = heap; buildstate.spool = NULL; @@ -379,6 +377,11 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, btspool->index = index; btspool->isunique = indexInfo->ii_Unique; btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; + /* + * We need to ignore dead tuples for unique checks in case of concurrent build. + * It is required because of the periodic reset of the snapshot. + */ + btspool->unique_dead_ignored = indexInfo->ii_Concurrent && indexInfo->ii_Unique; /* Save as primary spool */ buildstate->spool = btspool; @@ -427,8 +430,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * the use of parallelism or any other factor.
*/ buildstate->spool->sortstate = - tuplesort_begin_index_btree(heap, index, buildstate->isunique, - buildstate->nulls_not_distinct, + tuplesort_begin_index_btree(heap, index, btspool->isunique, + btspool->nulls_not_distinct, + btspool->unique_dead_ignored, maintenance_work_mem, coordinate, TUPLESORT_NONE); @@ -436,8 +440,12 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * If building a unique index, put dead tuples in a second spool to keep * them out of the uniqueness check. We expect that the second spool (for * dead tuples) won't get very full, so we give it only work_mem. + * + * In a concurrent build, dead tuples do not need to be put into the index + * since we wait for all snapshots older than the reference snapshot during + * the validation phase. */ - if (indexInfo->ii_Unique) + if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent) { BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); SortCoordinate coordinate2 = NULL; @@ -468,7 +476,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, * full, so we give it only work_mem */ buildstate->spool2->sortstate = - tuplesort_begin_index_btree(heap, index, false, false, work_mem, + tuplesort_begin_index_btree(heap, index, false, false, false, work_mem, coordinate2, TUPLESORT_NONE); } @@ -1147,13 +1155,116 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; + bool fail_on_alive_duplicate; wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); + /* + * unique_dead_ignored does not guarantee the absence of multiple alive + * tuples with the same values in the spool. That may happen if alive + * tuples are separated by a few dead tuples, like this: addda.
+ */ + fail_on_alive_duplicate = btspool->unique_dead_ignored; - if (merge) + if (fail_on_alive_duplicate) + { + bool seen_alive = false, + prev_tested = false; + IndexTuple prev = NULL; + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap), + &TTSOpsBufferHeapTuple); + IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap); + + Assert(btspool->isunique); + Assert(!btspool2); + + while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) + { + bool tuples_equal = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (prev != NULL) /* not the first tuple */ + { + bool has_nulls = false, + call_again, /* just to pass something */ + ignored, /* just to pass something */ + now_alive; + ItemPointerData tid; + + /* is this tuple equal to the previous one? */ + if (wstate->inskey->allequalimage) + tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz; + else + tuples_equal = _bt_keep_natts(wstate->index, prev, itup,wstate->inskey, &has_nulls) > keysz; + + /* handle null values correctly */ + if (has_nulls && !btspool->nulls_not_distinct) + tuples_equal = false; + + if (tuples_equal) + { + /* check previous tuple if not yet */ + if (!prev_tested) + { + call_again = false; + tid = prev->t_tid; + seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + prev_tested = true; + } + + call_again = false; + tid = itup->t_tid; + now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + /* are multiple alive tuples detected in equal group?
*/ + if (seen_alive && now_alive) + { + char *key_desc; + TupleDesc tupDes = RelationGetDescr(wstate->index); + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + + index_deform_tuple(itup, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(wstate->index, values, isnull); + + /* keep this message in sync with the same in comparetup_index_btree_tiebreak */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(wstate->index)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(wstate->heap, + RelationGetRelationName(wstate->index)))); + } + seen_alive |= now_alive; + } + } + + if (!tuples_equal) + { + seen_alive = false; + prev_tested = false; + } + + _bt_buildadd(wstate, state, itup, 0); + if (prev) pfree(prev); + prev = CopyIndexTuple(itup); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + } + else if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge @@ -1314,7 +1425,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1411,7 +1522,6 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; - bool reset_snapshot; bool wait_for_snapshot_attach; int querylen; @@ -1430,21 +1540,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? 
request + 1 : request; - /* - * For concurrent non-unique index builds, we can periodically reset snapshots - * to allow the xmin horizon to advance. This is safe since these builds don't - * require a consistent view across the entire scan. Unique indexes still need - * a stable snapshot to properly enforce uniqueness constraints. - */ - reset_snapshot = isconcurrent && !btspool->isunique; - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that, while that snapshot may be reset periodically in - * case of non-unique index. + * live according to that, while that snapshot may be reset periodically. */ if (!isconcurrent) { @@ -1452,16 +1553,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } - else if (reset_snapshot) + else { + /* + * For concurrent index builds, we can periodically reset snapshots to allow + * the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. 
+ */ snapshot = InvalidSnapshot; PushActiveSnapshot(GetTransactionSnapshot()); } - else - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1531,6 +1632,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1545,7 +1647,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), snapshot, - reset_snapshot); + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1626,7 +1728,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * In case when leader going to reset own active snapshot as well - we need to * wait until all workers imported initial snapshot. 
*/ - wait_for_snapshot_attach = reset_snapshot && leaderparticipates; + wait_for_snapshot_attach = isconcurrent && leaderparticipates; if (wait_for_snapshot_attach) WaitForParallelWorkersToAttach(pcxt, true); @@ -1742,6 +1844,7 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ if (!btleader->btshared->isunique) @@ -1845,11 +1948,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || btshared->isconcurrent) { btspool2 = NULL; sharedsort2 = NULL; @@ -1928,6 +2032,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1950,14 +2055,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; 
buildstate.spool = btspool; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 1f40d40263e..e2ed4537026 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -687,7 +687,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -718,7 +718,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -967,7 +967,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -988,7 +988,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1027,7 +1027,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1149,7 +1149,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 50cbf06cb45..3d6dda4ace8 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -100,8 +100,6 @@ static bool _bt_check_rowcompare(ScanKey skey, ScanDirection dir, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); /* @@ -4672,7 +4670,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -4790,17 +4788,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple 
firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. * + * This is exported to be used as a comparison function during concurrent + * unique index builds when _bt_keep_natts_fast is not suitable because the + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * If hasnulls is given, it is set to true when any examined attribute is NULL. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -4826,6 +4831,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + (*hasnulls) |= (isNull1 || isNull2); if (isNull1 != isNull2) break; @@ -4845,7 +4852,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -4856,7 +4863,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles.
+ * save cycles. Also, it may be used as a comparison function during a + * concurrent build of a unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -4865,6 +4873,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * If hasnulls is given, it is set to true when any examined attribute is NULL. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -4873,7 +4883,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -4890,6 +4901,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= (isNull1 | isNull2); att = TupleDescAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index e0ada5ce159..f6a1a2f3f90 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3292,9 +3292,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index.
* - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index e07ba4ea4b1..aa4fcaac9a0 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -123,6 +123,7 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ } TuplesortIndexBTreeArg; /* @@ -349,6 +350,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -391,6 +393,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1520,6 +1523,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1529,18 +1533,57 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - 
- key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* This is fail-fast check, see _bt_load for details. */ + if (arg->uniqueDeadIgnored) + { + bool any_tuple_dead, + call_again = false, + ignored; + + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tuple2->t_tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 123fba624db..4200d2bd20e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1297,8 +1297,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 9ee5ea15fd4..ec3769585c3 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1803,9 +1803,8 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent index build SO_RESET_SNAPSHOT is applied - * for the scan. That leads for changing snapshots on the fly to allow xmin - * horizon propagate. + * For a concurrent index build, SO_RESET_SNAPSHOT is applied to the scan. + * That causes snapshots to be changed on the fly so the xmin horizon can advance.
*/ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index cde83f62015..ae5f4d28fdc 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -428,6 +428,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index 49ef68d9071..c8e4683ad6d 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -41,7 +41,11 @@ END; $$; ---------------- ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); (1 row) CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY 
cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_parallelscan_initialize -- 2.43.0