public inbox for [email protected]  
help / color / mirror / Atom feed
From: Andrey Borodin <[email protected]>
To: Zsolt Parragi <[email protected]>
Cc: pgsql-hackers <[email protected]>
Subject: Re: amcheck: add index-all-keys-match verification for B-Tree
Date: Fri, 27 Feb 2026 16:21:15 +0500
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
References: <[email protected]>
	<CAN4CZFNCyEOC8exfEn9SVV=uhzUA4KQYcVRUea2E0ts-rvXg3w@mail.gmail.com>
	<[email protected]>



> On 26 Feb 2026, at 10:28, Andrey Borodin <[email protected]> wrote:
> 
> I have a gut feeling that we might do without snapshot at all...

I decided that it's a good idea to verify only visible tuples.
Mismatch of dead tuples might be bad sign, but it's not a corruption.
So we need a snapshot.

PFA patch with documentation.

Thanks!


Best regards, Andrey Borodin.



Attachments:

  [application/octet-stream] v3-0001-amcheck-add-indexallkeysmatch-verification-for-B-.patch (28.1K, 2-v3-0001-amcheck-add-indexallkeysmatch-verification-for-B-.patch)
  download | inline diff:
From d12e7009277cea96b75797698fd647e4ef2dd83d Mon Sep 17 00:00:00 2001
From: Andrey Borodin <[email protected]>
Date: Tue, 17 Feb 2026 12:05:51 +0500
Subject: [PATCH v3 1/3] amcheck: add indexallkeysmatch verification for B-Tree
 indexes

Add a new "indexallkeysmatch" option to bt_index_check() and
bt_index_parent_check() that verifies each index tuple points to a
heap tuple with the same key.  This is the reverse of the existing
"heapallindexed" check, which verifies every heap tuple is present
in the index.

The implementation uses a Bloom filter to amortize random heap
lookups.  A sequential heap scan first fingerprints all visible
(key, tid) pairs.  During the index scan, each leaf tuple (including
posting list entries) is probed against this filter.  Only when the
filter says "not present" do we perform an actual heap fetch and
key comparison via FormIndexDatum, reporting corruption if the keys
differ.

This check detects corruption where an index entry stores a
different key than the heap tuple it points to -- a scenario that
the existing heapallindexed and structural checks cannot catch,
particularly when it manifests within posting lists before it
develops into a detectable ordering violation.

Bump amcheck extension version to 1.6.
---
 contrib/amcheck/Makefile                      |   4 +-
 contrib/amcheck/amcheck--1.5--1.6.sql         |  21 ++
 contrib/amcheck/amcheck.control               |   2 +-
 contrib/amcheck/expected/check_btree.out      |  20 ++
 contrib/amcheck/meson.build                   |   2 +
 contrib/amcheck/sql/check_btree.sql           |   7 +
 .../t/007_verify_nbtree_indexallkeysmatch.pl  | 111 +++++++
 contrib/amcheck/verify_nbtree.c               | 310 +++++++++++++++---
 8 files changed, 437 insertions(+), 40 deletions(-)
 create mode 100644 contrib/amcheck/amcheck--1.5--1.6.sql
 create mode 100644 contrib/amcheck/t/007_verify_nbtree_indexallkeysmatch.pl

diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
index 1b7a63cbaa4..2c23109a200 100644
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -10,12 +10,12 @@ OBJS = \
 
 EXTENSION = amcheck
 DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql \
-		amcheck--1.3--1.4.sql amcheck--1.4--1.5.sql
+		amcheck--1.3--1.4.sql amcheck--1.4--1.5.sql amcheck--1.5--1.6.sql
 PGFILEDESC = "amcheck - function for verifying relation integrity"
 
 REGRESS = check check_btree check_gin check_heap
 
-EXTRA_INSTALL = contrib/pg_walinspect
+EXTRA_INSTALL = contrib/pg_walinspect contrib/pg_surgery
 TAP_TESTS = 1
 
 ifdef USE_PGXS
diff --git a/contrib/amcheck/amcheck--1.5--1.6.sql b/contrib/amcheck/amcheck--1.5--1.6.sql
new file mode 100644
index 00000000000..ac01f429d7d
--- /dev/null
+++ b/contrib/amcheck/amcheck--1.5--1.6.sql
@@ -0,0 +1,21 @@
+/* contrib/amcheck/amcheck--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.6'" to load this file. \quit
+
+-- Add indexallkeysmatch parameter to bt_index_check and bt_index_parent_check
+CREATE FUNCTION bt_index_check(index regclass,
+    heapallindexed boolean, checkunique boolean, indexallkeysmatch boolean)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'bt_index_check'
+LANGUAGE C STRICT PARALLEL RESTRICTED;
+
+CREATE FUNCTION bt_index_parent_check(index regclass,
+    heapallindexed boolean, rootdescend boolean, checkunique boolean,
+    indexallkeysmatch boolean)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'bt_index_parent_check'
+LANGUAGE C STRICT PARALLEL RESTRICTED;
+
+REVOKE ALL ON FUNCTION bt_index_check(regclass, boolean, boolean, boolean) FROM PUBLIC;
+REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean, boolean, boolean) FROM PUBLIC;
diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control
index c8ba6d7c9bc..2f329ef2cf4 100644
--- a/contrib/amcheck/amcheck.control
+++ b/contrib/amcheck/amcheck.control
@@ -1,5 +1,5 @@
 # amcheck extension
 comment = 'functions for verifying relation integrity'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/amcheck'
 relocatable = true
diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out
index 6558f2c5a4f..d4da803b355 100644
--- a/contrib/amcheck/expected/check_btree.out
+++ b/contrib/amcheck/expected/check_btree.out
@@ -232,6 +232,26 @@ SELECT bt_index_parent_check('bttest_b_idx', heapallindexed => true, rootdescend
  
 (1 row)
 
+-- indexallkeysmatch: verify each index tuple points to heap tuple with same key
+SELECT bt_index_check('bttest_a_idx', heapallindexed => false, checkunique => false, indexallkeysmatch => true);
+ bt_index_check 
+----------------
+ 
+(1 row)
+
+SELECT bt_index_parent_check('bttest_a_idx', heapallindexed => false, rootdescend => false, checkunique => false, indexallkeysmatch => true);
+ bt_index_parent_check 
+-----------------------
+ 
+(1 row)
+
+-- indexallkeysmatch on expression index (exercises FormIndexDatum/ii_ExpressionsState path)
+SELECT bt_index_check('bttest_a_expr_idx', heapallindexed => false, checkunique => false, indexallkeysmatch => true);
+ bt_index_check 
+----------------
+ 
+(1 row)
+
 -- Check that null values in an unique index are not treated as equal
 CREATE TABLE bttest_unique_nulls (a serial, b int, c int UNIQUE);
 INSERT INTO bttest_unique_nulls VALUES (generate_series(1, 10000), 2, default);
diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build
index d5137ef691d..220b1ce1d59 100644
--- a/contrib/amcheck/meson.build
+++ b/contrib/amcheck/meson.build
@@ -27,6 +27,7 @@ install_data(
   'amcheck--1.2--1.3.sql',
   'amcheck--1.3--1.4.sql',
   'amcheck--1.4--1.5.sql',
+  'amcheck--1.5--1.6.sql',
   kwargs: contrib_data_args,
 )
 
@@ -50,6 +51,7 @@ tests += {
       't/004_verify_nbtree_unique.pl',
       't/005_pitr.pl',
       't/006_verify_gin.pl',
+      't/007_verify_nbtree_indexallkeysmatch.pl',
     ],
   },
 }
diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql
index 171f7f691ec..bc5a76c0b59 100644
--- a/contrib/amcheck/sql/check_btree.sql
+++ b/contrib/amcheck/sql/check_btree.sql
@@ -148,6 +148,13 @@ SELECT bt_index_check('bttest_b_idx', heapallindexed => false, checkunique => tr
 SELECT bt_index_parent_check('bttest_a_idx', heapallindexed => true, rootdescend => true, checkunique => true);
 SELECT bt_index_parent_check('bttest_b_idx', heapallindexed => true, rootdescend => false, checkunique => true);
 
+-- indexallkeysmatch: verify each index tuple points to heap tuple with same key
+SELECT bt_index_check('bttest_a_idx', heapallindexed => false, checkunique => false, indexallkeysmatch => true);
+SELECT bt_index_parent_check('bttest_a_idx', heapallindexed => false, rootdescend => false, checkunique => false, indexallkeysmatch => true);
+
+-- indexallkeysmatch on expression index (exercises FormIndexDatum/ii_ExpressionsState path)
+SELECT bt_index_check('bttest_a_expr_idx', heapallindexed => false, checkunique => false, indexallkeysmatch => true);
+
 -- Check that null values in an unique index are not treated as equal
 CREATE TABLE bttest_unique_nulls (a serial, b int, c int UNIQUE);
 INSERT INTO bttest_unique_nulls VALUES (generate_series(1, 10000), 2, default);
diff --git a/contrib/amcheck/t/007_verify_nbtree_indexallkeysmatch.pl b/contrib/amcheck/t/007_verify_nbtree_indexallkeysmatch.pl
new file mode 100644
index 00000000000..caf2cbd93e7
--- /dev/null
+++ b/contrib/amcheck/t/007_verify_nbtree_indexallkeysmatch.pl
@@ -0,0 +1,111 @@
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+#
+# Test indexallkeysmatch verification: each index tuple must point to a heap
+# tuple with the same key.
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('test');
+$node->init;
+$node->append_conf('postgresql.conf', 'autovacuum=off');
+$node->start;
+
+$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
+
+#
+# Test 1: indexallkeysmatch on uncorrupted table (plain column)
+#
+$node->safe_psql('postgres', q(
+	CREATE TABLE idxall_plain (id int);
+	INSERT INTO idxall_plain SELECT generate_series(1, 1000);
+	CREATE INDEX idxall_plain_idx ON idxall_plain (id);
+));
+
+my $result = $node->safe_psql('postgres',
+	q(SELECT bt_index_check('idxall_plain_idx', false, false, true)));
+is($result, '', 'indexallkeysmatch passes on uncorrupted plain table');
+
+#
+# Test 2: indexallkeysmatch on uncorrupted table with deduplication
+#
+$node->safe_psql('postgres', q(
+	CREATE TABLE idxall_dedup (id int);
+	INSERT INTO idxall_dedup SELECT i % 10 FROM generate_series(1, 1000) i;
+	CREATE INDEX idxall_dedup_idx ON idxall_dedup (id)
+		WITH (deduplicate_items = on);
+));
+
+$result = $node->safe_psql('postgres',
+	q(SELECT bt_index_check('idxall_dedup_idx', false, false, true)));
+is($result, '', 'indexallkeysmatch passes on uncorrupted table with deduplication');
+
+#
+# Test 3: indexallkeysmatch on uncorrupted expression index
+#
+$node->safe_psql('postgres', q(
+	CREATE FUNCTION idxall_func(int) RETURNS int LANGUAGE sql IMMUTABLE AS
+	$$ SELECT $1; $$;
+
+	CREATE TABLE idxall_expr (id int);
+	INSERT INTO idxall_expr SELECT generate_series(1, 500);
+	CREATE INDEX idxall_expr_idx ON idxall_expr (idxall_func(id));
+));
+
+$result = $node->safe_psql('postgres',
+	q(SELECT bt_index_check('idxall_expr_idx', false, false, true)));
+is($result, '', 'indexallkeysmatch passes on uncorrupted expression index');
+
+#
+# Test 4: Detect corruption -- swap expression function so heap re-evaluation
+# produces different keys than what the index stores.
+#
+# The index was built with idxall_func(x) = x.  Now change it to return x+1.
+# This simulates the corruption scenario: index says key=42 for a TID, but
+# re-reading the heap and re-evaluating gives key=43.
+#
+$node->safe_psql('postgres', q(
+	CREATE OR REPLACE FUNCTION idxall_func(int) RETURNS int LANGUAGE sql IMMUTABLE AS
+	$$ SELECT $1 + 1; $$;
+));
+
+my ($stdout, $stderr);
+($result, $stdout, $stderr) = $node->psql('postgres',
+	q(SELECT bt_index_check('idxall_expr_idx', false, false, true)));
+like($stderr,
+	qr/index tuple in index "idxall_expr_idx" does not match heap tuple/,
+	'detected index-heap key mismatch via expression function swap');
+
+#
+# Test 5: Restore function and verify no corruption reported
+#
+$node->safe_psql('postgres', q(
+	CREATE OR REPLACE FUNCTION idxall_func(int) RETURNS int LANGUAGE sql IMMUTABLE AS
+	$$ SELECT $1; $$;
+));
+
+$result = $node->safe_psql('postgres',
+	q(SELECT bt_index_check('idxall_expr_idx', false, false, true)));
+is($result, '', 'indexallkeysmatch passes after restoring correct function');
+
+#
+# Test 6: indexallkeysmatch with bt_index_parent_check
+#
+$node->safe_psql('postgres', q(
+	CREATE OR REPLACE FUNCTION idxall_func(int) RETURNS int LANGUAGE sql IMMUTABLE AS
+	$$ SELECT $1 + 1; $$;
+));
+
+($result, $stdout, $stderr) = $node->psql('postgres',
+	q(SELECT bt_index_parent_check('idxall_expr_idx', false, false, false, true)));
+like($stderr,
+	qr/index tuple in index "idxall_expr_idx" does not match heap tuple/,
+	'bt_index_parent_check also detects index-heap key mismatch');
+
+$node->stop;
+done_testing();
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 34433e819d4..ec6297d21b3 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -13,6 +13,11 @@
  * verify its structure.  A heap scan later uses Bloom filter probes to verify
  * that every visible heap tuple has a matching index tuple.
  *
+ * When heap-to-index verification (indexallkeysmatch) is requested, a Bloom
+ * filter fingerprints (key,tid) from a heap scan first.  The index scan then
+ * probes this filter; when the probe fails, a heap lookup verifies that the
+ * index tuple points to a heap tuple with the same key.
+ *
  *
  * Copyright (c) 2017-2026, PostgreSQL Global Development Group
  *
@@ -30,8 +35,9 @@
 #include "access/tableam.h"
 #include "access/transam.h"
 #include "access/xact.h"
-#include "verify_common.h"
 #include "catalog/index.h"
+#include "executor/executor.h"
+#include "verify_common.h"
 #include "catalog/pg_am.h"
 #include "catalog/pg_opfamily_d.h"
 #include "common/pg_prng.h"
@@ -82,6 +88,8 @@ typedef struct BtreeCheckState
 	bool		readonly;
 	/* Also verifying heap has no unindexed tuples? */
 	bool		heapallindexed;
+	/* Also verifying each index tuple points to heap tuple with same key? */
+	bool		indexallkeysmatch;
 	/* Also making sure non-pivot tuples can be found by new search? */
 	bool		rootdescend;
 	/* Also check uniqueness constraint if index is unique */
@@ -132,6 +140,15 @@ typedef struct BtreeCheckState
 	bloom_filter *filter;
 	/* Debug counter */
 	int64		heaptuplespresent;
+
+	/*
+	 * Mutable state, for optional indexallkeysmatch verification:
+	 */
+
+	/* Bloom filter fingerprints heap (key,tid) pairs */
+	bloom_filter *heapfilter;
+	/* Debug counter for index tuples verified */
+	int64		indextuplesverified;
 } BtreeCheckState;
 
 /*
@@ -169,6 +186,7 @@ typedef struct BTCallbackState
 {
 	bool		parentcheck;
 	bool		heapallindexed;
+	bool		indexallkeysmatch;
 	bool		rootdescend;
 	bool		checkunique;
 } BTCallbackState;
@@ -180,7 +198,7 @@ static void bt_index_check_callback(Relation indrel, Relation heaprel,
 									void *state, bool readonly);
 static void bt_check_every_level(Relation rel, Relation heaprel,
 								 bool heapkeyspace, bool readonly, bool heapallindexed,
-								 bool rootdescend, bool checkunique);
+								 bool indexallkeysmatch, bool rootdescend, bool checkunique);
 static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
 											   BtreeLevel level);
 static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
@@ -212,6 +230,13 @@ static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
 static void bt_tuple_present_callback(Relation index, ItemPointer tid,
 									  Datum *values, bool *isnull,
 									  bool tupleIsAlive, void *checkstate);
+static void bt_heap_fingerprint_callback(Relation index, ItemPointer tid,
+										  Datum *values, bool *isnull,
+										  bool tupleIsAlive, void *checkstate);
+static void bt_verify_index_tuple_points_to_heap(BtreeCheckState *state,
+												  IndexTuple itup,
+												  BlockNumber targetblock,
+												  OffsetNumber offset);
 static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
 									 IndexTuple itup);
 static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n);
@@ -240,13 +265,15 @@ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state,
 static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup);
 
 /*
- * bt_index_check(index regclass, heapallindexed boolean, checkunique boolean)
+ * bt_index_check(index regclass, heapallindexed boolean, checkunique boolean, indexallkeysmatch boolean)
  *
  * Verify integrity of B-Tree index.
  *
  * Acquires AccessShareLock on heap & index relations.  Does not consider
  * invariants that exist between parent/child pages.  Optionally verifies
- * that heap does not contain any unindexed or incorrectly indexed tuples.
+ * that heap does not contain any unindexed or incorrectly indexed tuples
+ * (heapallindexed), or that each index tuple points to a heap tuple with
+ * the same key (indexallkeysmatch).
  */
 Datum
 bt_index_check(PG_FUNCTION_ARGS)
@@ -255,6 +282,7 @@ bt_index_check(PG_FUNCTION_ARGS)
 	BTCallbackState args;
 
 	args.heapallindexed = false;
+	args.indexallkeysmatch = false;
 	args.rootdescend = false;
 	args.parentcheck = false;
 	args.checkunique = false;
@@ -263,6 +291,8 @@ bt_index_check(PG_FUNCTION_ARGS)
 		args.heapallindexed = PG_GETARG_BOOL(1);
 	if (PG_NARGS() >= 3)
 		args.checkunique = PG_GETARG_BOOL(2);
+	if (PG_NARGS() >= 4)
+		args.indexallkeysmatch = PG_GETARG_BOOL(3);
 
 	amcheck_lock_relation_and_check(indrelid, BTREE_AM_OID,
 									bt_index_check_callback,
@@ -272,13 +302,15 @@ bt_index_check(PG_FUNCTION_ARGS)
 }
 
 /*
- * bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean, checkunique boolean)
+ * bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean, checkunique boolean, indexallkeysmatch boolean)
  *
  * Verify integrity of B-Tree index.
  *
  * Acquires ShareLock on heap & index relations.  Verifies that downlinks in
  * parent pages are valid lower bounds on child pages.  Optionally verifies
- * that heap does not contain any unindexed or incorrectly indexed tuples.
+ * that heap does not contain any unindexed or incorrectly indexed tuples
+ * (heapallindexed), or that each index tuple points to a heap tuple with
+ * the same key (indexallkeysmatch).
  */
 Datum
 bt_index_parent_check(PG_FUNCTION_ARGS)
@@ -287,6 +319,7 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
 	BTCallbackState args;
 
 	args.heapallindexed = false;
+	args.indexallkeysmatch = false;
 	args.rootdescend = false;
 	args.parentcheck = true;
 	args.checkunique = false;
@@ -297,6 +330,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
 		args.rootdescend = PG_GETARG_BOOL(2);
 	if (PG_NARGS() >= 4)
 		args.checkunique = PG_GETARG_BOOL(3);
+	if (PG_NARGS() >= 5)
+		args.indexallkeysmatch = PG_GETARG_BOOL(4);
 
 	amcheck_lock_relation_and_check(indrelid, BTREE_AM_OID,
 									bt_index_check_callback,
@@ -348,7 +383,8 @@ bt_index_check_callback(Relation indrel, Relation heaprel, void *state, bool rea
 
 	/* Check index, possibly against table it is an index on */
 	bt_check_every_level(indrel, heaprel, heapkeyspace, readonly,
-						 args->heapallindexed, args->rootdescend, args->checkunique);
+						 args->heapallindexed, args->indexallkeysmatch,
+						 args->rootdescend, args->checkunique);
 }
 
 /*
@@ -376,8 +412,8 @@ bt_index_check_callback(Relation indrel, Relation heaprel, void *state, bool rea
  */
 static void
 bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
-					 bool readonly, bool heapallindexed, bool rootdescend,
-					 bool checkunique)
+					 bool readonly, bool heapallindexed, bool indexallkeysmatch,
+					 bool rootdescend, bool checkunique)
 {
 	BtreeCheckState *state;
 	Page		metapage;
@@ -407,38 +443,17 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 	state->heapkeyspace = heapkeyspace;
 	state->readonly = readonly;
 	state->heapallindexed = heapallindexed;
+	state->indexallkeysmatch = indexallkeysmatch;
 	state->rootdescend = rootdescend;
 	state->checkunique = checkunique;
 	state->snapshot = InvalidSnapshot;
 
-	if (state->heapallindexed)
+	if (state->heapallindexed || state->indexallkeysmatch)
 	{
-		int64		total_pages;
-		int64		total_elems;
-		uint64		seed;
-
 		/*
-		 * Size Bloom filter based on estimated number of tuples in index,
-		 * while conservatively assuming that each block must contain at least
-		 * MaxTIDsPerBTreePage / 3 "plain" tuples -- see
-		 * bt_posting_plain_tuple() for definition, and details of how posting
-		 * list tuples are handled.
-		 */
-		total_pages = RelationGetNumberOfBlocks(rel);
-		total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
-						  (int64) state->rel->rd_rel->reltuples);
-		/* Generate a random seed to avoid repetition */
-		seed = pg_prng_uint64(&pg_global_prng_state);
-		/* Create Bloom filter to fingerprint index */
-		state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
-		state->heaptuplespresent = 0;
-
-		/*
-		 * Register our own snapshot for heapallindexed, rather than asking
-		 * table_index_build_scan() to do this for us later.  This needs to
-		 * happen before index fingerprinting begins, so we can later be
-		 * certain that index fingerprinting should have reached all tuples
-		 * returned by table_index_build_scan().
+		 * Register our own snapshot for heapallindexed/indexallkeysmatch, rather
+		 * than asking table_index_build_scan() to do this for us later.  This
+		 * needs to happen before fingerprinting begins.
 		 */
 		state->snapshot = RegisterSnapshot(GetTransactionSnapshot());
 
@@ -463,16 +478,55 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 						   RelationGetRelationName(rel)));
 	}
 
+	if (state->heapallindexed)
+	{
+		int64		total_pages;
+		int64		total_elems;
+		uint64		seed;
+
+		/*
+		 * Size Bloom filter based on estimated number of tuples in index,
+		 * while conservatively assuming that each block must contain at least
+		 * MaxTIDsPerBTreePage / 3 "plain" tuples -- see
+		 * bt_posting_plain_tuple() for definition, and details of how posting
+		 * list tuples are handled.
+		 */
+		total_pages = RelationGetNumberOfBlocks(rel);
+		total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
+						  (int64) state->rel->rd_rel->reltuples);
+		seed = pg_prng_uint64(&pg_global_prng_state);
+		state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
+		state->heaptuplespresent = 0;
+	}
+
+	if (state->indexallkeysmatch)
+	{
+		int64		total_pages;
+		int64		total_elems;
+		uint64		seed;
+
+		/*
+		 * Size Bloom filter based on estimated number of heap tuples.
+		 */
+		total_pages = RelationGetNumberOfBlocks(heaprel);
+		total_elems = Max(total_pages * MaxHeapTuplesPerPage,
+						  (int64) heaprel->rd_rel->reltuples);
+		seed = pg_prng_uint64(&pg_global_prng_state);
+		state->heapfilter = bloom_create(total_elems, maintenance_work_mem, seed);
+		state->indextuplesverified = 0;
+	}
+
 	/*
 	 * We need a snapshot to check the uniqueness of the index.  For better
 	 * performance, take it once per index check.  If one was already taken
 	 * above, use that.
 	 */
-	if (state->checkunique)
+	if (state->checkunique || state->indexallkeysmatch)
 	{
 		state->indexinfo = BuildIndexInfo(state->rel);
 
-		if (state->indexinfo->ii_Unique && state->snapshot == InvalidSnapshot)
+		if (state->checkunique && state->indexinfo->ii_Unique &&
+			state->snapshot == InvalidSnapshot)
 			state->snapshot = RegisterSnapshot(GetTransactionSnapshot());
 	}
 
@@ -490,6 +544,36 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 												 ALLOCSET_DEFAULT_SIZES);
 	state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);
 
+	/*
+	 * When indexallkeysmatch, fingerprint heap first so we can verify each index
+	 * tuple points to a heap tuple with the same key during the index scan.
+	 */
+	if (state->indexallkeysmatch)
+	{
+		IndexInfo  *indexinfo = BuildIndexInfo(state->rel);
+		TableScanDesc scan;
+
+		scan = table_beginscan_strat(state->heaprel,
+									 state->snapshot,
+									 0, NULL, true, true);
+		indexinfo->ii_Concurrent = true;
+		indexinfo->ii_Unique = false;
+		indexinfo->ii_ExclusionOps = NULL;
+		indexinfo->ii_ExclusionProcs = NULL;
+		indexinfo->ii_ExclusionStrats = NULL;
+
+		elog(DEBUG1, "fingerprinting heap \"%s\" for index \"%s\" verification",
+			 RelationGetRelationName(state->heaprel),
+			 RelationGetRelationName(state->rel));
+
+		table_index_build_scan(state->heaprel, state->rel, indexinfo, true, false,
+							   bt_heap_fingerprint_callback, state, scan);
+
+		ereport(DEBUG1,
+				(errmsg_internal("finished heap fingerprint with bitset %.2f%% set",
+								100.0 * bloom_prop_bits_set(state->heapfilter))));
+	}
+
 	/* Get true root block from meta-page */
 	metapage = palloc_btree_page(state, BTREE_METAPAGE);
 	metad = BTPageGetMeta(metapage);
@@ -596,6 +680,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 		bloom_free(state->filter);
 	}
 
+	if (state->indexallkeysmatch)
+	{
+		ereport(DEBUG1,
+				(errmsg_internal("finished verifying " INT64_FORMAT " index tuples point to matching heap tuples",
+								 state->indextuplesverified)));
+		bloom_free(state->heapfilter);
+	}
+
 	/* Be tidy: */
 	if (state->snapshot != InvalidSnapshot)
 		UnregisterSnapshot(state->snapshot);
@@ -1516,6 +1608,28 @@ bt_target_page_check(BtreeCheckState *state)
 			}
 		}
 
+		/* Verify each index tuple points to heap tuple with same key */
+		if (state->indexallkeysmatch && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
+		{
+			if (BTreeTupleIsPosting(itup))
+			{
+				for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+				{
+					IndexTuple	logtuple;
+
+					logtuple = bt_posting_plain_tuple(itup, i);
+					bt_verify_index_tuple_points_to_heap(state, logtuple,
+														  state->targetblock, offset);
+					pfree(logtuple);
+				}
+			}
+			else
+			{
+				bt_verify_index_tuple_points_to_heap(state, itup,
+													 state->targetblock, offset);
+			}
+		}
+
 		/*
 		 * * High key check *
 		 *
@@ -2812,6 +2926,128 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values,
 		pfree(norm);
 }
 
+/*
+ * Per-tuple callback from table_index_build_scan for indexallkeysmatch.  Add
+ * each visible heap tuple's (key, tid) to the Bloom filter for later probe
+ * during the index scan.
+ */
+static void
+bt_heap_fingerprint_callback(Relation index, ItemPointer tid, Datum *values,
+							  bool *isnull, bool tupleIsAlive, void *checkstate)
+{
+	BtreeCheckState *state = (BtreeCheckState *) checkstate;
+	IndexTuple	itup,
+				norm;
+
+	Assert(state->indexallkeysmatch);
+
+	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
+	itup->t_tid = *tid;
+	norm = bt_normalize_tuple(state, itup);
+	bloom_add_element(state->heapfilter, (unsigned char *) norm,
+					  IndexTupleSize(norm));
+	pfree(itup);
+	if (norm != itup)
+		pfree(norm);
+}
+
+/*
+ * Verify that the index tuple points to a heap tuple with the same key.
+ * When the Bloom filter lacks the (key, tid), perform a heap lookup to confirm.
+ * Skip index tuples that point to dead heap tuples (not visible to snapshot).
+ */
+static void
+bt_verify_index_tuple_points_to_heap(BtreeCheckState *state, IndexTuple itup,
+									BlockNumber targetblock, OffsetNumber offset)
+{
+	ItemPointer tid = BTreeTupleGetHeapTID(itup);
+	IndexTuple	norm;
+	bool		in_filter;
+
+	Assert(state->indexallkeysmatch);
+
+	norm = bt_normalize_tuple(state, itup);
+	in_filter = !bloom_lacks_element(state->heapfilter, (unsigned char *) norm,
+									 IndexTupleSize(norm));
+	if (norm != itup)
+		pfree(norm);
+
+	if (in_filter)
+	{
+		/* Fingerprint contains only visible tuples, so this one is verified */
+		state->indextuplesverified++;
+		return;
+	}
+
+	/*
+	 * Bloom filter says (key, tid) not in heap.  Follow TID to verify; this
+	 * amortizes random heap lookups when the filter has false negatives, or
+	 * reports corruption when the index points to wrong heap tuple.  Skip
+	 * dead tuples (table_tuple_fetch_row_version returns false for them).
+	 */
+	{
+		TupleTableSlot *slot;
+		IndexTuple	heap_itup;
+		IndexTuple	heap_norm;
+		Datum		values[INDEX_MAX_KEYS];
+		bool		isnull[INDEX_MAX_KEYS];
+		IndexInfo  *indexinfo;
+		EState	   *estate;
+		bool		found;
+
+		slot = table_slot_create(state->heaprel, NULL);
+		found = table_tuple_fetch_row_version(state->heaprel, tid,
+											  state->snapshot, slot);
+		if (!found)
+		{
+			ExecDropSingleTupleTableSlot(slot);
+			return;			/* dead or non-existent heap tuple, skip */
+		}
+
+		indexinfo = state->indexinfo;
+		estate = CreateExecutorState();
+		GetPerTupleExprContext(estate)->ecxt_scantuple = slot;
+		FormIndexDatum(indexinfo, slot, estate, values, isnull);
+		FreeExecutorState(estate);
+
+		/* These may have been pointing to the now-gone estate */
+		indexinfo->ii_ExpressionsState = NIL;
+		indexinfo->ii_PredicateState = NULL;
+
+		heap_itup = index_form_tuple(RelationGetDescr(state->rel), values, isnull);
+		heap_itup->t_tid = *tid;
+		heap_norm = bt_normalize_tuple(state, heap_itup);
+
+		norm = bt_normalize_tuple(state, itup);
+		if (IndexTupleSize(heap_norm) != IndexTupleSize(norm) ||
+			memcmp(heap_norm, norm, IndexTupleSize(norm)) != 0)
+		{
+			ExecDropSingleTupleTableSlot(slot);
+			pfree(heap_itup);
+			if (heap_norm != heap_itup)
+				pfree(heap_norm);
+			if (norm != itup)
+				pfree(norm);
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index tuple in index \"%s\" does not match heap tuple",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=(%u,%u) points to heap tid=(%u,%u) with different key.",
+									   targetblock, offset,
+									   ItemPointerGetBlockNumber(tid),
+									   ItemPointerGetOffsetNumber(tid))));
+		}
+
+		ExecDropSingleTupleTableSlot(slot);
+		pfree(heap_itup);
+		if (heap_norm != heap_itup)
+			pfree(heap_norm);
+		if (norm != itup)
+			pfree(norm);
+		state->indextuplesverified++;
+	}
+}
+
 /*
  * Normalize an index tuple for fingerprinting.
  *
-- 
2.51.2



  [application/octet-stream] v3-0003-Document-indexallkeysmatch-parameter-for-amcheck-.patch (5.8K, 3-v3-0003-Document-indexallkeysmatch-parameter-for-amcheck-.patch)
  download | inline diff:
From 4b19ab8f1bafb17077e972ddd21e9333c940755d Mon Sep 17 00:00:00 2001
From: Andrey Borodin <[email protected]>
Date: Thu, 26 Feb 2026 22:01:16 +0500
Subject: [PATCH v3 3/3] Document indexallkeysmatch parameter for amcheck
 B-Tree verification

Add indexallkeysmatch to bt_index_check and bt_index_parent_check
signatures, describe the parameter, add a dedicated section explaining
the verification, and mention it in the corruption detection list.
---
 doc/src/sgml/amcheck.sgml | 55 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml
index 08006856579..793fd1661ff 100644
--- a/doc/src/sgml/amcheck.sgml
+++ b/doc/src/sgml/amcheck.sgml
@@ -61,7 +61,7 @@
   <variablelist>
    <varlistentry>
     <term>
-     <function>bt_index_check(index regclass, heapallindexed boolean, checkunique boolean) returns void</function>
+     <function>bt_index_check(index regclass, heapallindexed boolean, checkunique boolean, indexallkeysmatch boolean) returns void</function>
      <indexterm>
       <primary>bt_index_check</primary>
      </indexterm>
@@ -118,7 +118,10 @@ ORDER BY c.relpages DESC LIMIT 10;
       that span child/parent relationships, but will verify the
       presence of all heap tuples as index tuples within the index
       when <parameter>heapallindexed</parameter> is
-      <literal>true</literal>.  When <parameter>checkunique</parameter>
+      <literal>true</literal>.  When <parameter>indexallkeysmatch</parameter>
+      is <literal>true</literal>, it verifies that each index tuple
+      points to a heap tuple with the same key (the reverse of
+      <parameter>heapallindexed</parameter>).  When <parameter>checkunique</parameter>
       is <literal>true</literal> <function>bt_index_check</function> will
       check that no more than one among duplicate entries in unique
       index is visible.  When a routine, lightweight test for
@@ -132,7 +135,7 @@ ORDER BY c.relpages DESC LIMIT 10;
 
    <varlistentry>
     <term>
-     <function>bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean, checkunique boolean) returns void</function>
+     <function>bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean, checkunique boolean, indexallkeysmatch boolean) returns void</function>
      <indexterm>
       <primary>bt_index_parent_check</primary>
      </indexterm>
@@ -145,7 +148,9 @@ ORDER BY c.relpages DESC LIMIT 10;
       Optionally, when the <parameter>heapallindexed</parameter>
       argument is <literal>true</literal>, the function verifies the
       presence of all heap tuples that should be found within the
-      index.  When <parameter>checkunique</parameter>
+      index.  When <parameter>indexallkeysmatch</parameter> is
+      <literal>true</literal>, it verifies that each index tuple
+      points to a heap tuple with the same key.  When <parameter>checkunique</parameter>
       is <literal>true</literal> <function>bt_index_parent_check</function> will
       check that no more than one among duplicate entries in unique
       index is visible.  When the optional <parameter>rootdescend</parameter>
@@ -416,6 +421,41 @@ SET client_min_messages = DEBUG1;
 
  </sect2>
 
+ <sect2 id="amcheck-optional-indexallkeysmatch-verification">
+  <title>Optional <parameter>indexallkeysmatch</parameter> Verification</title>
+ <para>
+  When the <parameter>indexallkeysmatch</parameter> argument to B-Tree
+  verification functions is <literal>true</literal>, an additional
+  phase verifies that each index tuple points to a heap tuple with the
+  same key.  This is the reverse of <parameter>heapallindexed</parameter>:
+  <parameter>heapallindexed</parameter> checks that every heap tuple is
+  in the index, while <parameter>indexallkeysmatch</parameter> checks
+  that every index tuple points to a matching heap tuple.
+ </para>
+ <para>
+  The implementation uses a Bloom filter to amortize random heap
+  lookups.  A sequential heap scan first fingerprints all visible
+  (key, tid) pairs.  During the index scan, each leaf tuple is probed
+  against this filter.  Only when the filter says &quot;not present&quot; does
+  an actual heap fetch and key comparison occur.  Corruption is
+  reported when an index tuple points to a non-existent heap slot, or
+  when the heap tuple's key (as computed by <function>FormIndexDatum</function>)
+  differs from the index tuple's key.
+ </para>
+ <para>
+  Like <parameter>heapallindexed</parameter>, this check requires an
+  MVCC snapshot to obtain a consistent view of the heap and index.
+  Index tuples that point to dead heap tuples (not visible to the
+  snapshot) are skipped.
+ </para>
+ <para>
+  The summarizing structure is bound in size by
+  <varname>maintenance_work_mem</varname>, using the same sizing
+  approach as <parameter>heapallindexed</parameter>.
+ </para>
+
+ </sect2>
+
  <sect2 id="amcheck-using-amcheck-effectively">
   <title>Using <filename>amcheck</filename> Effectively</title>
 
@@ -458,12 +498,15 @@ SET client_min_messages = DEBUG1;
    <listitem>
     <para>
      Structural inconsistencies between indexes and the heap relations
-     that are indexed (when <parameter>heapallindexed</parameter>
-     verification is performed).
+     that are indexed (when <parameter>heapallindexed</parameter> or
+     <parameter>indexallkeysmatch</parameter> verification is performed).
     </para>
     <para>
      There is no cross-checking of indexes against their heap relation
      during normal operation.  Symptoms of heap corruption can be subtle.
+     <parameter>indexallkeysmatch</parameter> detects cases where an index
+     tuple stores a different key than the heap tuple it points to, or
+     points to a non-existent heap slot.
     </para>
    </listitem>
    <listitem>
-- 
2.51.2



  [application/octet-stream] v3-0002-amcheck-report-corruption-when-index-points-to-no.patch (2.7K, 4-v3-0002-amcheck-report-corruption-when-index-points-to-no.patch)
  download | inline diff:
From 5647b7768553e9f3b60f2b2805af586d8be9a641 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <[email protected]>
Date: Thu, 26 Feb 2026 09:32:06 +0500
Subject: [PATCH v3 2/3] amcheck: report corruption when index points to
 non-existent heap tuple

Use SnapshotAny to distinguish "tuple doesn't exist" (corruption) from
"tuple exists but is dead" (skip).  When table_tuple_fetch_row_version
with SnapshotAny returns false, the heap slot was reclaimed or the page
was reorganized (e.g. by VACUUM), so the index has an orphaned entry.
Report that as corruption instead of silently skipping.
---
 contrib/amcheck/verify_nbtree.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index ec6297d21b3..29ce35dcd6c 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -2982,8 +2982,12 @@ bt_verify_index_tuple_points_to_heap(BtreeCheckState *state, IndexTuple itup,
 	/*
 	 * Bloom filter says (key, tid) not in heap.  Follow TID to verify; this
 	 * amortizes random heap lookups when the filter has false negatives, or
-	 * reports corruption when the index points to wrong heap tuple.  Skip
-	 * dead tuples (table_tuple_fetch_row_version returns false for them).
+	 * reports corruption when the index points to wrong heap tuple.
+	 *
+	 * Use SnapshotAny first to distinguish "tuple doesn't exist" (corruption)
+	 * from "tuple exists but is dead" (skip).  SnapshotAny returns any tuple
+	 * at the TID; if that fails, the slot was reclaimed or the page was
+	 * reorganized (e.g. by VACUUM), so the index has an orphaned entry.
 	 */
 	{
 		TupleTableSlot *slot;
@@ -2997,11 +3001,25 @@ bt_verify_index_tuple_points_to_heap(BtreeCheckState *state, IndexTuple itup,
 
 		slot = table_slot_create(state->heaprel, NULL);
 		found = table_tuple_fetch_row_version(state->heaprel, tid,
-											  state->snapshot, slot);
+											  SnapshotAny, slot);
 		if (!found)
 		{
 			ExecDropSingleTupleTableSlot(slot);
-			return;			/* dead or non-existent heap tuple, skip */
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("index tuple in index \"%s\" points to non-existent heap tuple",
+							RelationGetRelationName(state->rel)),
+					 errdetail_internal("Index tid=(%u,%u) points to heap tid=(%u,%u) that no longer exists.",
+									   targetblock, offset,
+									   ItemPointerGetBlockNumber(tid),
+									   ItemPointerGetOffsetNumber(tid))));
+		}
+
+		/* Skip dead tuples (not visible to our snapshot) */
+		if (!table_tuple_satisfies_snapshot(state->heaprel, slot, state->snapshot))
+		{
+			ExecDropSingleTupleTableSlot(slot);
+			return;
 		}
 
 		indexinfo = state->indexinfo;
-- 
2.51.2



view thread (6+ messages)  latest in thread

reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Reply to all the recipients using the --to and --cc options:
  reply via email

  To: [email protected]
  Cc: [email protected], [email protected]
  Subject: Re: amcheck: add index-all-keys-match verification for B-Tree
  In-Reply-To: <[email protected]>

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox