public inbox for [email protected]
help / color / mirror / Atom feedFrom: Mihail Nikalayeu <[email protected]>
To: Zhijie Hou (Fujitsu) <[email protected]>
To: Peter Geoghegan <[email protected]>
Cc: Amit Kapila <[email protected]>
Cc: Hayato Kuroda (Fujitsu) <[email protected]>
Cc: PostgreSQL Hackers <[email protected]>
Cc: Andres Freund <[email protected]>
Subject: Re: [BUG?] check_exclusion_or_unique_constraint false negative
Date: Sun, 18 May 2025 16:36:00 +0200
Message-ID: <CADzfLwW61K09Q8HnQ6zO+QHD2oHBmm5VmZbHa19=ZkMMX_Vcow@mail.gmail.com> (raw)
In-Reply-To: <CADzfLwWuXh8KO=OZvB71pZnQ8nH0NYXfuGbFU6FBiVZUbmuFGg@mail.gmail.com>
References: <CANtu0oiktqQ2pwExoXqDpByXNCJa-KE5vQRodTRnmFHN_+qwHg@mail.gmail.com>
<CANtu0ohU2XRV9shtu14CffLPDS1x10q7ebOGf-vX0p+45_L8jw@mail.gmail.com>
<CANtu0oh0tspW-xWzDGWP9ehz96KPt9aUP1c9JYhdBYxKsB0jpA@mail.gmail.com>
<CANtu0ohUB9ky45iiMAYN1fGyt82+cg=+UYBom=P7drb+=97G9w@mail.gmail.com>
<TYAPR01MB56921C9C3D21B0D62FF76330F5B22@TYAPR01MB5692.jpnprd01.prod.outlook.com>
<CANtu0og=5v4j8onS4nyJ4zMPdh-EPFxmiEi5PLoyZrmqHA6RKw@mail.gmail.com>
<CAA4eK1Jfb0xviXYon-_TvHNKeAY7ngAeo++Knu-0RPR6EkSBjA@mail.gmail.com>
<CANtu0ohHmYXsK5bxU9Thcq1FbELLAk0S2Zap0r8AnU3OTmcCOA@mail.gmail.com>
<CAA4eK1+_V1PWXrrgAM01p+CByP6JwYRxejZrcxOu83a-v_+zZg@mail.gmail.com>
<CANtu0ogDDQnXbrv6p7Xtc2dT_MZ1fjdPgB9-0B5Lw1b4pQGd2A@mail.gmail.com>
<OS0PR01MB5716FFD8DBBADB55E8E6935994852@OS0PR01MB5716.jpnprd01.prod.outlook.com>
<CANtu0oiziTBM8+WDtkktMZv0rhGBroYGWwqSQW+MzOWpmk-XEw@mail.gmail.com>
<OS0PR01MB5716E30952F542E256DD72E294802@OS0PR01MB5716.jpnprd01.prod.outlook.com>
<CANtu0oh69b+VCiASX86dF_eY=9=A2RmMQ_+0+uxZ_Zir+oNhhw@mail.gmail.com>
<CANtu0og_4FVsRMxXue8SXUn03MwBAT0SiZSib_wcPMLDkpn-RA@mail.gmail.com>
<CANtu0oj8LzpjCvF1zSwdLJxPddhbdY0=uh=7-wT6vwhmju_-PQ@mail.gmail.com>
<CANtu0ogHMahRJvLKPofE9T7Z19H3UyWeb22fZ9cKfFRG_BEV0w@mail.gmail.com>
<CANtu0ojos4kvrrQ9YJOei2=c5vB1wJBHpR3q_X+BG1i99ut+Hw@mail.gmail.com>
<CADzfLwWuXh8KO=OZvB71pZnQ8nH0NYXfuGbFU6FBiVZUbmuFGg@mail.gmail.com>
Hello, everyone!
Rebased + fix for compilation due the new INEJCTION_POINT signature.
Best regards,
Mikhail.
Attachments:
[application/octet-stream] v6-0001-Add-an-isolation-test-to-reproduce-a-dirty-snapsh.patch (5.8K, 2-v6-0001-Add-an-isolation-test-to-reproduce-a-dirty-snapsh.patch)
download | inline diff:
From 07f2d816c94d94c587cef16a6448de09ef1dc2d6 Mon Sep 17 00:00:00 2001
From: nkey <[email protected]>
Date: Sat, 23 Nov 2024 13:25:11 +0100
Subject: [PATCH v6 1/2] Add an isolation test to reproduce a dirty snapshot
scan issue
This commit introduces an isolation test to reliably reproduce the issue where non-MVCC index scans using SnapshotDirty can miss tuples due to concurrent modifications.
When using non-MVCC snapshot types, a scan could miss tuples if concurrent transactions delete existing tuples and insert new one with different TIDs on the same page.
---
src/backend/access/index/indexam.c | 8 ++++
src/backend/executor/execIndexing.c | 3 ++
src/test/modules/injection_points/Makefile | 2 +-
.../expected/dirty_index_scan.out | 27 ++++++++++++++
src/test/modules/injection_points/meson.build | 1 +
.../specs/dirty_index_scan.spec | 37 +++++++++++++++++++
6 files changed, 77 insertions(+), 1 deletion(-)
create mode 100644 src/test/modules/injection_points/expected/dirty_index_scan.out
create mode 100644 src/test/modules/injection_points/specs/dirty_index_scan.spec
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 219df1971da..676e06c095c 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -57,6 +57,7 @@
#include "utils/ruleutils.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
+#include "utils/injection_point.h"
/* ----------------------------------------------------------------
@@ -741,6 +742,13 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *
* the index.
*/
Assert(ItemPointerIsValid(&scan->xs_heaptid));
+#ifdef USE_INJECTION_POINTS
+ if (!IsCatalogRelationOid(scan->indexRelation->rd_id))
+ {
+ INJECTION_POINT("index_getnext_slot_before_fetch", NULL);
+ }
+#endif
+
if (index_fetch_heap(scan, slot))
return true;
}
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index bdf862b2406..36748b39e68 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -117,6 +117,7 @@
#include "utils/multirangetypes.h"
#include "utils/rangetypes.h"
#include "utils/snapmgr.h"
+#include "utils/injection_point.h"
/* waitMode argument to check_exclusion_or_unique_constraint() */
typedef enum
@@ -943,6 +944,8 @@ retry:
ExecDropSingleTupleTableSlot(existing_slot);
+ if (!conflict)
+ INJECTION_POINT("check_exclusion_or_unique_constraint_no_conflict", NULL);
return !conflict;
}
diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile
index e680991f8d4..b73f8ac80f2 100644
--- a/src/test/modules/injection_points/Makefile
+++ b/src/test/modules/injection_points/Makefile
@@ -14,7 +14,7 @@ PGFILEDESC = "injection_points - facility for injection points"
REGRESS = injection_points hashagg reindex_conc
REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
-ISOLATION = basic inplace syscache-update-pruned
+ISOLATION = basic inplace syscache-update-pruned dirty_index_scan
TAP_TESTS = 1
diff --git a/src/test/modules/injection_points/expected/dirty_index_scan.out b/src/test/modules/injection_points/expected/dirty_index_scan.out
new file mode 100644
index 00000000000..c286a9fd5b6
--- /dev/null
+++ b/src/test/modules/injection_points/expected/dirty_index_scan.out
@@ -0,0 +1,27 @@
+Parsed test spec with 3 sessions
+
+starting permutation: s1_s1 s2_s1 s3_s1
+injection_points_attach
+-----------------------
+
+(1 row)
+
+step s1_s1: INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1; <waiting ...>
+step s2_s1: UPDATE test.tbl SET n = n + 1 WHERE i = 42; <waiting ...>
+step s3_s1:
+ SELECT injection_points_detach('index_getnext_slot_before_fetch');
+ SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+ <waiting ...>
+step s1_s1: <... completed>
+step s2_s1: <... completed>
+step s3_s1: <... completed>
+injection_points_detach
+-----------------------
+
+(1 row)
+
+injection_points_wakeup
+-----------------------
+
+(1 row)
+
diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build
index d61149712fd..bb3869f9a75 100644
--- a/src/test/modules/injection_points/meson.build
+++ b/src/test/modules/injection_points/meson.build
@@ -47,6 +47,7 @@ tests += {
'basic',
'inplace',
'syscache-update-pruned',
+ 'dirty_index_scan',
],
'runningcheck': false, # see syscache-update-pruned
},
diff --git a/src/test/modules/injection_points/specs/dirty_index_scan.spec b/src/test/modules/injection_points/specs/dirty_index_scan.spec
new file mode 100644
index 00000000000..373bcaf4929
--- /dev/null
+++ b/src/test/modules/injection_points/specs/dirty_index_scan.spec
@@ -0,0 +1,37 @@
+setup
+{
+ CREATE EXTENSION injection_points;
+ CREATE SCHEMA test;
+ CREATE UNLOGGED TABLE test.tbl(i int primary key, n int);
+ CREATE INDEX tbl_n_idx ON test.tbl(n);
+ INSERT INTO test.tbl VALUES(42,1);
+}
+
+teardown
+{
+ DROP SCHEMA test CASCADE;
+ DROP EXTENSION injection_points;
+}
+
+session s1
+setup {
+ SELECT injection_points_set_local();
+ SELECT injection_points_attach('check_exclusion_or_unique_constraint_no_conflict', 'error');
+ SELECT injection_points_attach('index_getnext_slot_before_fetch', 'wait');
+}
+
+step s1_s1 { INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1; }
+
+session s2
+step s2_s1 { UPDATE test.tbl SET n = n + 1 WHERE i = 42; }
+
+session s3
+step s3_s1 {
+ SELECT injection_points_detach('index_getnext_slot_before_fetch');
+ SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
+}
+
+permutation
+ s1_s1
+ s2_s1(*)
+ s3_s1(s1_s1)
\ No newline at end of file
--
2.43.0
[application/octet-stream] v6-0002-Fix-btree-index-scan-concurrency-issues-with-dirt.patch (9.7K, 3-v6-0002-Fix-btree-index-scan-concurrency-issues-with-dirt.patch)
download | inline diff:
From 318822f0d2848fb3a3400dd2ea5f2c0400f9505b Mon Sep 17 00:00:00 2001
From: nkey <[email protected]>
Date: Wed, 12 Mar 2025 00:53:02 +0100
Subject: [PATCH v6 2/2] Fix btree index scan concurrency issues with dirty
snapshots
This patch addresses an issue where non-MVCC index scans using SnapshotDirty
or SnapshotSelf could miss tuples due to concurrent modifications. The fix
retains read locks on pages for these special snapshot types until the scan
is done with the page's tuples, preventing concurrent modifications from
causing inconsistent results.
Updated README to document this special case in the btree locking mechanism.
---
src/backend/access/nbtree/README | 13 +++++++++-
src/backend/access/nbtree/nbtree.c | 17 +++++++++++++
src/backend/access/nbtree/nbtsearch.c | 35 +++++++++++++++++++++++----
src/backend/access/nbtree/nbtutils.c | 8 +++++-
src/include/access/nbtree.h | 3 +++
5 files changed, 69 insertions(+), 7 deletions(-)
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 53d4a61dc3f..a9280415633 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we
came from. (Actually, it's even harder than that; see page deletion
discussion below.)
-Page read locks are held only for as long as a scan is examining a page.
+Page read locks are held only for as long as a scan is examining a page
+(with exception for SnapshotDirty and SnapshotSelf scans - see below).
To minimize lock/unlock traffic, an index scan always searches a leaf page
to identify all the matching items at once, copying their heap tuple IDs
into backend-local storage. The heap tuple IDs are then processed while
@@ -103,6 +104,16 @@ We also remember the left-link, and follow it when the scan moves backwards
(though this requires extra handling to account for concurrent splits of
the left sibling; see detailed move-left algorithm below).
+Despite the described mechanics in place, inconsistent results may still occur
+during non-MVCC scans (SnapshotDirty and SnapshotSelf). This issue can occur if a
+concurrent transaction deletes a tuple and inserts a new tuple with a new TID in the
+same page. If the scan has already visited the page and cached its content in the
+backend-local storage, it might skip the old tuple due to deletion and miss the new
+tuple because the scan does not re-read the page. To address this issue, for
+SnapshotDirty and SnapshotSelf scans, we retain the read lock on the page until
+we're completely done processing all the tuples from that page, preventing
+concurrent modifications that could lead to inconsistent results.
+
In most cases we release our lock and pin on a page before attempting
to acquire pin and lock on the page we are moving to. In a few places
it is necessary to lock the next page before releasing the current one.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af..8239076c518 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -389,6 +389,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
/* Before leaving current page, deal with any killed items */
if (so->numKilled > 0)
_bt_killitems(scan);
+ /* Release any extended lock held for SnapshotDirty/Self scans */
+ if (so->currPos.extra_unlock)
+ {
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ so->currPos.extra_unlock = false;
+ }
BTScanPosUnpinIfPinned(so->currPos);
BTScanPosInvalidate(so->currPos);
}
@@ -445,6 +451,12 @@ btendscan(IndexScanDesc scan)
/* Before leaving current page, deal with any killed items */
if (so->numKilled > 0)
_bt_killitems(scan);
+ /* Release any extended lock held for SnapshotDirty/Self scans */
+ if (so->currPos.extra_unlock)
+ {
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ so->currPos.extra_unlock = false;
+ }
BTScanPosUnpinIfPinned(so->currPos);
}
@@ -525,6 +537,11 @@ btrestrpos(IndexScanDesc scan)
/* Before leaving current page, deal with any killed items */
if (so->numKilled > 0)
_bt_killitems(scan);
+ if (so->currPos.extra_unlock)
+ {
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ so->currPos.extra_unlock = false;
+ }
BTScanPosUnpinIfPinned(so->currPos);
}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a3886913..2c5a34cacfe 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -61,11 +61,22 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
* This will prevent vacuum from stalling in a blocked state trying to read a
* page when a cursor is sitting on it.
*
+ * For SnapshotDirty and SnapshotSelf scans, we don't actually unlock the buffer
+ * here, but instead set extra_unlock to indicate that the lock should be held
+ * until we're completely done with this page. This prevents concurrent
+ * modifications from causing inconsistent results during non-MVCC scans.
+ *
* See nbtree/README section on making concurrent TID recycling safe.
*/
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
+ if (scan->xs_snapshot->snapshot_type == SNAPSHOT_DIRTY ||
+ scan->xs_snapshot->snapshot_type == SNAPSHOT_SELF)
+ {
+ sp->extra_unlock = true;
+ return;
+ }
_bt_unlockbuf(scan->indexRelation, sp->buf);
if (IsMVCCSnapshot(scan->xs_snapshot) &&
@@ -1527,7 +1538,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* _bt_next() -- Get the next item in a scan.
*
* On entry, so->currPos describes the current page, which may be pinned
- * but is not locked, and so->currPos.itemIndex identifies which item was
+ * but is not locked (except for SnapshotDirty and SnapshotSelf scans, where
+ * the page remains locked), and so->currPos.itemIndex identifies which item was
* previously returned.
*
* On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2107,10 +2119,11 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
*
* Wrapper on _bt_readnextpage that performs final steps for the current page.
*
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on. The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked,
+ * except for SnapshotDirty and SnapshotSelf scans where the buffer remains locked
+ * until we're done with all tuples from the page. If there's no pin held, it's
+ * because _bt_drop_lock_and_maybe_pin dropped the pin eagerly earlier on.
+ * The scan must have so->currPos.currPage set to a valid block, in any case.
*/
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2169,8 +2182,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
/* mark/restore not supported by parallel scans */
Assert(!scan->parallel_scan);
+ Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY);
+ Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF);
}
+ /*
+ * For SnapshotDirty/Self scans, we kept the read lock after processing
+ * the page's tuples (see _bt_drop_lock_and_maybe_pin). Now that we're
+ * moving to another page, we need to explicitly release that lock.
+ */
+ if (so->currPos.extra_unlock)
+ {
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ so->currPos.extra_unlock = false;
+ }
BTScanPosUnpinIfPinned(so->currPos);
/* Walk to the next page with data */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7d3..61008a36b5d 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3383,13 +3383,17 @@ _bt_killitems(IndexScanDesc scan)
* LSN.
*/
droppedpin = false;
- _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
+ /* For SnapshotDirty/Self scans, the buffer is already locked */
+ if (!so->currPos.extra_unlock)
+ _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
page = BufferGetPage(so->currPos.buf);
}
else
{
Buffer buf;
+ /* extra_unlock should never be set without a valid buffer pin */
+ Assert(!so->currPos.extra_unlock);
droppedpin = true;
/* Attempt to re-read the buffer, getting pin and lock. */
@@ -3526,6 +3530,8 @@ _bt_killitems(IndexScanDesc scan)
}
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ /* Reset the extra_unlock flag since we've now released the lock */
+ so->currPos.extra_unlock = false;
}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588d3..2c2485f34bd 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -962,6 +962,7 @@ typedef struct BTScanPosItem /* what we remember about each match */
typedef struct BTScanPosData
{
Buffer buf; /* currPage buf (invalid means unpinned) */
+ bool extra_unlock; /* for SnapshotDirty/Self, read lock is held even after _bt_drop_lock_and_maybe_pin */
/* page details as of the saved position's call to _bt_readpage */
BlockNumber currPage; /* page referenced by items array */
@@ -1009,6 +1010,7 @@ typedef BTScanPosData *BTScanPos;
)
#define BTScanPosUnpin(scanpos) \
do { \
+ Assert(!(scanpos).extra_unlock); \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)
@@ -1028,6 +1030,7 @@ typedef BTScanPosData *BTScanPos;
do { \
(scanpos).buf = InvalidBuffer; \
(scanpos).currPage = InvalidBlockNumber; \
+ (scanpos).extra_unlock = false; \
} while (0)
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
--
2.43.0
view thread (37+ messages) latest in thread
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected], [email protected], [email protected], [email protected], [email protected], [email protected]
Subject: Re: [BUG?] check_exclusion_or_unique_constraint false negative
In-Reply-To: <CADzfLwW61K09Q8HnQ6zO+QHD2oHBmm5VmZbHa19=ZkMMX_Vcow@mail.gmail.com>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox