public inbox for [email protected]
help / color / mirror / Atom feedFrom: Alexandre Felipe <[email protected]>
To: [email protected] <[email protected]>
Subject: New access method for b-tree.
Date: Sun, 1 Feb 2026 10:02:06 +0000
Message-ID: <AS1PR02MB784695AFEC37179FFAF7EAE19A9DA@AS1PR02MB7846.eurprd02.prod.outlook.com> (raw)
Hello Hackers,
Please check this out,
It is an access method to scan a table sorting by the second column of an index, filtered on the first.
Queries like
SELECT x, y FROM grid
WHERE x in (array of Nx elements)
ORDER BY y, x
LIMIT M
Can execute streaming the rows directly from disk instead of loading everything.
Using btree index on (x, y)
On a grid with N x N will run by fetching only what is necessary
A skip scal will run with O(N * Nx) I/O, O(N x Nx) space, O(N x Nx * log( N * Nx)) comput (assuming a generic in memory sort)
The proposed access method does it O(M + Nx) I/O, O(Nx) space, and O(M * log(Nx)) compute.
Kind Regards,
Alexandre Felipe
Research & Development Engineer
Attachments:
[application/octet-stream] btree_merge_rebased.patch (54.5K, 3-btree_merge_rebased.patch)
download | inline diff:
diff --git a/.gitignore b/.gitignore
index 4e911395fe3..ac1f95d9cf0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,11 @@ lib*.pc
/Release/
/tmp_install/
/portlock/
+
+# hidden files (e.g. .dbdata, .install, good practice to test locally in isolation)
+.*
+
+# Test output
+**/regression.diffs
+**/regression.out
+**/results/
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
index 0daf640af96..72053cefdaa 100644
--- a/src/backend/access/nbtree/Makefile
+++ b/src/backend/access/nbtree/Makefile
@@ -16,6 +16,7 @@ OBJS = \
nbtcompare.o \
nbtdedup.o \
nbtinsert.o \
+ nbtmergescan.o \
nbtpage.o \
nbtpreprocesskeys.o \
nbtreadpage.o \
diff --git a/src/backend/access/nbtree/meson.build b/src/backend/access/nbtree/meson.build
index 812f067e710..1016fea62d5 100644
--- a/src/backend/access/nbtree/meson.build
+++ b/src/backend/access/nbtree/meson.build
@@ -4,6 +4,7 @@ backend_sources += files(
'nbtcompare.c',
'nbtdedup.c',
'nbtinsert.c',
+ 'nbtmergescan.c',
'nbtpage.c',
'nbtpreprocesskeys.c',
'nbtreadpage.c',
diff --git a/src/backend/access/nbtree/nbtmergescan.c b/src/backend/access/nbtree/nbtmergescan.c
new file mode 100644
index 00000000000..70828dc73d3
--- /dev/null
+++ b/src/backend/access/nbtree/nbtmergescan.c
@@ -0,0 +1,457 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtmergescan.c
+ * B-Tree merge scan for efficient evaluation of IN-list queries
+ *
+ * This module implements a K-way merge scan for B-tree indexes, optimized
+ * for queries of the form:
+ * WHERE prefix IN (v1, v2, ..., vK) AND suffix >= b ORDER BY suffix LIMIT N
+ *
+ * The algorithm maintains a min-heap of cursors, one per prefix value.
+ * Each cursor tracks its position within the index for that prefix.
+ * Tuples are returned in suffix order by repeatedly extracting the
+ * minimum from the heap.
+ *
+ * Target behavior: Access at most N + K - 1 index tuples for LIMIT N.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtmergescan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "lib/pairingheap.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* Forward declarations of static functions */
+static int bt_merge_heap_cmp(const pairingheap_node *a,
+ const pairingheap_node *b,
+ void *arg);
+static bool bt_merge_cursor_init(BTMergeScanState *state,
+ IndexScanDesc scan,
+ BTMergeCursor *cursor,
+ Datum prefix_value,
+ bool prefix_isnull);
+static bool bt_merge_cursor_advance(BTMergeScanState *state,
+ IndexScanDesc scan,
+ BTMergeCursor *cursor);
+static Datum bt_merge_extract_sortkey(BTMergeScanState *state,
+ IndexScanDesc scan,
+ BTMergeCursor *cursor,
+ bool *isnull);
+
+
+/*
+ * bt_merge_heap_cmp
+ * Compare two cursors by their current sort key (suffix value).
+ *
+ * When sort keys are equal, uses prefix value as tiebreaker for
+ * deterministic ordering (ORDER BY suffix, prefix).
+ *
+ * Returns positive if a > b (pairingheap is a max-heap, we want min-heap
+ * behavior so we invert the comparison).
+ */
+static int
+bt_merge_heap_cmp(const pairingheap_node *a,
+ const pairingheap_node *b,
+ void *arg)
+{
+ BTMergeScanState *state = (BTMergeScanState *) arg;
+ BTMergeCursor *cursor_a = pairingheap_container(BTMergeCursor, ph_node,
+ (pairingheap_node *) a);
+ BTMergeCursor *cursor_b = pairingheap_container(BTMergeCursor, ph_node,
+ (pairingheap_node *) b);
+ Datum key_a = cursor_a->sort_key;
+ Datum key_b = cursor_b->sort_key;
+ bool null_a = cursor_a->sort_key_isnull;
+ bool null_b = cursor_b->sort_key_isnull;
+ int32 cmp;
+
+ /* Handle NULLs - NULLs sort last (NULLS LAST default for ASC) */
+ if (null_a && null_b)
+ return 0;
+ if (null_a)
+ return -1; /* a is NULL, comes after b */
+ if (null_b)
+ return 1; /* b is NULL, comes after a */
+
+ /* Compare using the suffix column's comparison function */
+ cmp = DatumGetInt32(FunctionCall2Coll(&state->suffix_cmp,
+ state->suffix_collation,
+ key_a, key_b));
+
+ /*
+ * Use prefix value as tiebreaker for deterministic ordering.
+ * This ensures ORDER BY suffix, prefix behavior.
+ */
+ if (cmp == 0)
+ {
+ /* Compare prefix values (assumes pass-by-value int4 for now) */
+ int32 prefix_a = DatumGetInt32(cursor_a->prefix_value);
+ int32 prefix_b = DatumGetInt32(cursor_b->prefix_value);
+
+ if (prefix_a < prefix_b)
+ cmp = -1;
+ else if (prefix_a > prefix_b)
+ cmp = 1;
+ }
+
+ /* Negate for min-heap behavior */
+ return -cmp;
+}
+
+
+/*
+ * bt_merge_init
+ * Initialize a merge scan state.
+ *
+ * Creates the merge state with one cursor per prefix value.
+ * The cursors will be positioned at their first matching tuples
+ * when bt_merge_getnext is first called.
+ */
+BTMergeScanState *
+bt_merge_init(IndexScanDesc scan,
+ Datum *prefix_values,
+ bool *prefix_nulls,
+ int num_prefixes,
+ int prefix_attno,
+ int suffix_attno,
+ Oid suffix_cmp_oid,
+ Oid suffix_collation)
+{
+ BTMergeScanState *state;
+ MemoryContext merge_context;
+ MemoryContext old_context;
+ int i;
+
+ /* Create memory context for merge scan allocations */
+ merge_context = AllocSetContextCreate(CurrentMemoryContext,
+ "BTMergeScan",
+ ALLOCSET_DEFAULT_SIZES);
+ old_context = MemoryContextSwitchTo(merge_context);
+
+ /* Allocate main state structure */
+ state = palloc0(sizeof(BTMergeScanState));
+ state->merge_context = merge_context;
+ state->num_cursors = num_prefixes;
+ state->active_cursors = 0;
+ state->prefix_attno = prefix_attno;
+ state->suffix_attno = suffix_attno;
+ state->suffix_collation = suffix_collation;
+ state->direction = ForwardScanDirection;
+ state->initialized = false;
+ state->tuples_accessed = 0;
+
+ /* Set up suffix comparison function */
+ fmgr_info(suffix_cmp_oid, &state->suffix_cmp);
+
+ /* Allocate cursor array */
+ state->cursors = palloc0(num_prefixes * sizeof(BTMergeCursor));
+
+ /* Initialize cursor metadata (not positioned yet) */
+ for (i = 0; i < num_prefixes; i++)
+ {
+ BTMergeCursor *cursor = &state->cursors[i];
+
+ cursor->cursor_id = i;
+ cursor->prefix_value = datumCopy(prefix_values[i], true, sizeof(Datum));
+ cursor->prefix_isnull = prefix_nulls[i];
+ cursor->exhausted = prefix_nulls[i]; /* NULL prefix = exhausted */
+ cursor->sort_key_isnull = true;
+ BTScanPosInvalidate(cursor->pos);
+ cursor->tuples = NULL;
+ }
+
+ /* Initialize the merge heap */
+ state->merge_heap = pairingheap_allocate(bt_merge_heap_cmp, state);
+
+ MemoryContextSwitchTo(old_context);
+
+ return state;
+}
+
+
+/*
+ * bt_merge_getnext
+ * Get the next tuple from the merge scan.
+ *
+ * Returns true if a tuple was found, false if scan is exhausted.
+ * The tuple's TID is stored in scan->xs_heaptid.
+ */
+bool
+bt_merge_getnext(IndexScanDesc scan, ScanDirection dir)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTMergeScanState *state = so->mergeState;
+ BTMergeCursor *cursor;
+ pairingheap_node *node;
+ int i;
+
+ if (state == NULL)
+ return false;
+
+ /* Initialize cursors on first call */
+ if (!state->initialized)
+ {
+ state->initialized = true;
+ state->direction = dir;
+
+ for (i = 0; i < state->num_cursors; i++)
+ {
+ BTMergeCursor *c = &state->cursors[i];
+
+ if (!c->exhausted &&
+ bt_merge_cursor_init(state, scan, c,
+ c->prefix_value, c->prefix_isnull))
+ {
+ /* Cursor has at least one tuple, add to heap */
+ pairingheap_add(state->merge_heap, &c->ph_node);
+ state->active_cursors++;
+ }
+ }
+ }
+
+ /* Get the cursor with the smallest suffix value */
+ if (pairingheap_is_empty(state->merge_heap))
+ return false;
+
+ node = pairingheap_remove_first(state->merge_heap);
+ cursor = pairingheap_container(BTMergeCursor, ph_node, node);
+
+ /* Set up the heap TID from the current cursor position */
+ Assert(BTScanPosIsValid(cursor->pos));
+ scan->xs_heaptid = cursor->pos.items[cursor->pos.itemIndex].heapTid;
+
+ /* Advance cursor to next tuple */
+ if (bt_merge_cursor_advance(state, scan, cursor))
+ {
+ /* Cursor still has tuples, re-add to heap */
+ pairingheap_add(state->merge_heap, &cursor->ph_node);
+ }
+ else
+ {
+ /* Cursor exhausted */
+ state->active_cursors--;
+ }
+
+ return true;
+}
+
+
+/*
+ * bt_merge_end
+ * Clean up merge scan state.
+ */
+void
+bt_merge_end(BTMergeScanState *state)
+{
+ if (state == NULL)
+ return;
+
+ /* Free the memory context, which frees all allocations */
+ MemoryContextDelete(state->merge_context);
+}
+
+
+/*
+ * bt_merge_cursor_init
+ * Initialize a cursor and position it at the first matching tuple.
+ *
+ * Returns true if the cursor found at least one matching tuple.
+ */
+static bool
+bt_merge_cursor_init(BTMergeScanState *state,
+ IndexScanDesc scan,
+ BTMergeCursor *cursor,
+ Datum prefix_value,
+ bool prefix_isnull)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool found;
+
+ if (prefix_isnull)
+ {
+ cursor->exhausted = true;
+ return false;
+ }
+
+ /*
+ * Modify the scan key to use this cursor's prefix value.
+ * We reuse the scan's existing key infrastructure.
+ */
+ for (int i = 0; i < so->numberOfKeys; i++)
+ {
+ if (so->keyData[i].sk_attno == state->prefix_attno)
+ {
+ so->keyData[i].sk_argument = prefix_value;
+ so->keyData[i].sk_flags &= ~(SK_SEARCHARRAY);
+ break;
+ }
+ }
+
+ /* Invalidate current position to force _bt_first */
+ BTScanPosInvalidate(so->currPos);
+
+ /* Disable array key handling for this cursor's scan */
+ so->numArrayKeys = 0;
+
+ /* Position at first matching tuple */
+ found = _bt_first(scan, state->direction);
+
+ if (found)
+ {
+ /* Copy position to cursor */
+ memcpy(&cursor->pos, &so->currPos, sizeof(BTScanPosData));
+
+ /* Extract the sort key for heap ordering */
+ cursor->sort_key = bt_merge_extract_sortkey(state, scan, cursor,
+ &cursor->sort_key_isnull);
+ cursor->exhausted = false;
+
+ /* Count this as a tuple access */
+ state->tuples_accessed++;
+
+ /* Invalidate main scan position */
+ BTScanPosInvalidate(so->currPos);
+ }
+ else
+ {
+ cursor->exhausted = true;
+ }
+
+ return found;
+}
+
+
+/*
+ * bt_merge_cursor_advance
+ * Advance a cursor to its next tuple.
+ *
+ * Returns true if the cursor now points to a valid tuple, false if exhausted.
+ */
+static bool
+bt_merge_cursor_advance(BTMergeScanState *state,
+ IndexScanDesc scan,
+ BTMergeCursor *cursor)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ bool found = false;
+
+ if (cursor->exhausted)
+ return false;
+
+ /* Try to move to next tuple within current page's items array */
+ if (state->direction == ForwardScanDirection)
+ {
+ if (cursor->pos.itemIndex < cursor->pos.lastItem)
+ {
+ cursor->pos.itemIndex++;
+ found = true;
+ }
+ }
+ else
+ {
+ if (cursor->pos.itemIndex > cursor->pos.firstItem)
+ {
+ cursor->pos.itemIndex--;
+ found = true;
+ }
+ }
+
+ if (!found)
+ {
+ /*
+ * Current page exhausted. Use _bt_next to get the next page.
+ * We swap our cursor's position into the scan's currPos,
+ * call _bt_next, then swap back.
+ */
+ BTScanPosData save_pos;
+
+ memcpy(&save_pos, &so->currPos, sizeof(BTScanPosData));
+ memcpy(&so->currPos, &cursor->pos, sizeof(BTScanPosData));
+
+ found = _bt_next(scan, state->direction);
+
+ if (found)
+ memcpy(&cursor->pos, &so->currPos, sizeof(BTScanPosData));
+
+ memcpy(&so->currPos, &save_pos, sizeof(BTScanPosData));
+ }
+
+ if (found)
+ {
+ /* Extract new sort key */
+ cursor->sort_key = bt_merge_extract_sortkey(state, scan, cursor,
+ &cursor->sort_key_isnull);
+ state->tuples_accessed++;
+ }
+ else
+ {
+ cursor->exhausted = true;
+ }
+
+ return found;
+}
+
+
+/*
+ * bt_merge_extract_sortkey
+ * Extract the sort key (suffix column value) from the current tuple.
+ */
+static Datum
+bt_merge_extract_sortkey(BTMergeScanState *state,
+ IndexScanDesc scan,
+ BTMergeCursor *cursor,
+ bool *isnull)
+{
+ Relation rel = scan->indexRelation;
+ Buffer buf;
+ Page page;
+ OffsetNumber offnum;
+ ItemId itemid;
+ IndexTuple itup;
+ TupleDesc tupdesc;
+ Datum result;
+
+ if (cursor->pos.currPage == InvalidBlockNumber)
+ {
+ *isnull = true;
+ return (Datum) 0;
+ }
+
+ /* Read the page */
+ buf = ReadBuffer(rel, cursor->pos.currPage);
+ LockBuffer(buf, BT_READ);
+ page = BufferGetPage(buf);
+
+ offnum = cursor->pos.items[cursor->pos.itemIndex].indexOffset;
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ tupdesc = RelationGetDescr(rel);
+
+ /* Extract the suffix column value */
+ result = index_getattr(itup, state->suffix_attno, tupdesc, isnull);
+
+ /* Copy pass-by-reference values before releasing buffer */
+ if (!*isnull)
+ {
+ Form_pg_attribute attr = TupleDescAttr(tupdesc, state->suffix_attno - 1);
+
+ if (!attr->attbyval)
+ result = datumCopy(result, attr->attbyval, attr->attlen);
+ }
+
+ UnlockReleaseBuffer(buf);
+
+ return result;
+}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 77224859685..0d4e7440760 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -20,6 +20,7 @@
#include "catalog/pg_am_d.h"
#include "catalog/pg_class.h"
#include "catalog/pg_index.h"
+#include "lib/pairingheap.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/dsm.h"
@@ -1050,6 +1051,49 @@ typedef struct BTArrayKeyInfo
ScanKey high_compare; /* array's < or <= upper bound */
} BTArrayKeyInfo;
+/*
+ * BTMergeCursor - tracks scan state for one prefix value in merge scan
+ *
+ * Each cursor maintains its own position within the index for a specific
+ * prefix value. Cursors are organized in a min-heap ordered by their
+ * current suffix key value for efficient K-way merge.
+ */
+typedef struct BTMergeCursor
+{
+ pairingheap_node ph_node; /* pairing heap node for merge */
+ int cursor_id; /* index in merge state's cursors array */
+ Datum prefix_value; /* the prefix value for this sub-scan */
+ bool prefix_isnull; /* is prefix value NULL? */
+ Datum sort_key; /* current tuple's sort key (suffix) */
+ bool sort_key_isnull;/* is sort key NULL? */
+ bool exhausted; /* no more tuples for this prefix */
+ BTScanPosData pos; /* current position in index */
+ char *tuples; /* tuple storage workspace (BLCKSZ) */
+} BTMergeCursor;
+
+/*
+ * BTMergeScanState - state for K-way merge scan
+ *
+ * This structure manages multiple cursors for a merge scan, allowing
+ * lazy evaluation of queries like:
+ * WHERE prefix IN (v1, v2, ..., vK) AND suffix >= b ORDER BY suffix LIMIT N
+ */
+typedef struct BTMergeScanState
+{
+ int num_cursors; /* number of prefix values (K) */
+ int active_cursors; /* cursors not yet exhausted */
+ BTMergeCursor *cursors; /* array of cursors */
+ pairingheap *merge_heap; /* min-heap ordered by sort_key */
+ int prefix_attno; /* attribute number of prefix column (1-based) */
+ int suffix_attno; /* attribute number of suffix column (1-based) */
+ FmgrInfo suffix_cmp; /* comparison function for suffix */
+ Oid suffix_collation; /* collation for suffix comparison */
+ ScanDirection direction; /* scan direction */
+ bool initialized; /* have cursors been initialized? */
+ MemoryContext merge_context;/* memory context for allocations */
+ int64 tuples_accessed;/* count of index tuples accessed */
+} BTMergeScanState;
+
typedef struct BTScanOpaqueData
{
/* these fields are set by _bt_preprocess_keys(): */
@@ -1089,6 +1133,12 @@ typedef struct BTScanOpaqueData
*/
int markItemIndex; /* itemIndex, or -1 if not valid */
+ /*
+ * Merge scan state, if using merge scan optimization.
+ * NULL if not using merge scan.
+ */
+ BTMergeScanState *mergeState;
+
/* keep these last in struct for efficiency */
BTScanPosData currPos; /* current position data */
BTScanPosData markPos; /* marked position, if any */
@@ -1334,4 +1384,18 @@ extern IndexBuildResult *btbuild(Relation heap, Relation index,
struct IndexInfo *indexInfo);
extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
+/*
+ * prototypes for functions in nbtmergescan.c
+ */
+extern BTMergeScanState *bt_merge_init(IndexScanDesc scan,
+ Datum *prefix_values,
+ bool *prefix_nulls,
+ int num_prefixes,
+ int prefix_attno,
+ int suffix_attno,
+ Oid suffix_cmp_oid,
+ Oid suffix_collation);
+extern bool bt_merge_getnext(IndexScanDesc scan, ScanDirection dir);
+extern void bt_merge_end(BTMergeScanState *state);
+
#endif /* NBTREE_H */
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 2634a519935..b7b802bfdde 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -18,6 +18,7 @@ subdir('ssl_passphrase_callback')
subdir('test_aio')
subdir('test_binaryheap')
subdir('test_bitmapset')
+subdir('test_btree_merge')
subdir('test_bloomfilter')
subdir('test_cloexec')
subdir('test_copy_callbacks')
diff --git a/src/test/modules/test_btree_merge/Makefile b/src/test/modules/test_btree_merge/Makefile
new file mode 100644
index 00000000000..540416a2c91
--- /dev/null
+++ b/src/test/modules/test_btree_merge/Makefile
@@ -0,0 +1,24 @@
+# src/test/modules/test_btree_merge/Makefile
+
+MODULE_big = test_btree_merge
+OBJS = \
+ $(WIN32RES) \
+ test_btree_merge.o
+
+PGFILEDESC = "test_btree_merge - test code for btree merge scan"
+
+EXTENSION = test_btree_merge
+DATA = test_btree_merge--1.0.sql
+
+REGRESS = test_btree_merge
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_btree_merge
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_btree_merge/expected/test_btree_merge.out b/src/test/modules/test_btree_merge/expected/test_btree_merge.out
new file mode 100644
index 00000000000..baf4d7937e0
--- /dev/null
+++ b/src/test/modules/test_btree_merge/expected/test_btree_merge.out
@@ -0,0 +1,243 @@
+-- Unit tests for B-tree merge scan implementation
+-- Tests the core merge scan algorithm directly, bypassing the planner
+CREATE EXTENSION test_btree_merge;
+-- ============================================================================
+-- Setup: Create test tables with known data distributions
+-- ============================================================================
+-- Test table with integer prefix and suffix
+CREATE TABLE merge_test_int (
+ prefix_col int4,
+ suffix_col int4
+);
+-- Insert data: 10 prefix values, 100 suffix values each = 1000 rows
+INSERT INTO merge_test_int
+SELECT p, s
+FROM generate_series(1, 10) AS p,
+ generate_series(1, 100) AS s;
+CREATE INDEX merge_test_int_idx ON merge_test_int (prefix_col, suffix_col);
+ANALYZE merge_test_int;
+-- Test table with integer prefix and timestamp suffix
+CREATE TABLE merge_test_ts (
+ user_id int4,
+ event_time timestamp
+);
+-- Insert data: 5 users, 100 events each
+INSERT INTO merge_test_ts
+SELECT u, '2026-01-01 00:00:00'::timestamp + (e || ' minutes')::interval
+FROM generate_series(1, 5) AS u,
+ generate_series(1, 100) AS e;
+CREATE INDEX merge_test_ts_idx ON merge_test_ts (user_id, event_time);
+ANALYZE merge_test_ts;
+-- ============================================================================
+-- Test 1: Basic integer merge scan
+-- Query: WHERE prefix IN (1,2,3) AND suffix >= 50 LIMIT 5
+-- K = 3 prefix values, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+SELECT 'Test 1: Basic integer merge scan' AS test_name;
+ test_name
+----------------------------------
+ Test 1: Basic integer merge scan
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3],
+ 50,
+ 5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches
+-----------------+-----------------+--------------------------
+ 5 | 8 | 7
+(1 row)
+
+-- ============================================================================
+-- Test 2: More prefix values
+-- Query: WHERE prefix IN (1,2,3,4,5) AND suffix >= 80 LIMIT 3
+-- K = 5 prefix values, LIMIT = 3
+-- Expected tuples accessed: 3 + 5 - 1 = 7
+-- ============================================================================
+SELECT 'Test 2: More prefix values' AS test_name;
+ test_name
+----------------------------
+ Test 2: More prefix values
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3, 4, 5],
+ 80,
+ 3
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches
+-----------------+-----------------+--------------------------
+ 3 | 8 | 7
+(1 row)
+
+-- ============================================================================
+-- Test 3: Single prefix value (degenerates to regular scan)
+-- K = 1, LIMIT = 5
+-- Expected tuples accessed: 5 + 1 - 1 = 5
+-- ============================================================================
+SELECT 'Test 3: Single prefix value' AS test_name;
+ test_name
+-----------------------------
+ Test 3: Single prefix value
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1],
+ 50,
+ 5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches
+-----------------+-----------------+--------------------------
+ 5 | 6 | 5
+(1 row)
+
+-- ============================================================================
+-- Test 4: Large LIMIT (more than matching rows)
+-- K = 3, prefix values that have 51 rows each (suffix >= 50)
+-- LIMIT = 200 but only 153 rows exist
+-- ============================================================================
+SELECT 'Test 4: Large LIMIT' AS test_name;
+ test_name
+---------------------
+ Test 4: Large LIMIT
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3],
+ 50,
+ 200
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches
+-----------------+-----------------+--------------------------
+ 153 | 153 | 153
+(1 row)
+
+-- ============================================================================
+-- Test 5: Non-contiguous prefix values
+-- Query: WHERE prefix IN (2,5,8) AND suffix >= 50 LIMIT 5
+-- Tests that merge scan works with gaps in prefix values
+-- K = 3 prefix values (non-adjacent), LIMIT = 5
+-- ============================================================================
+SELECT 'Test 5: Non-contiguous prefix values' AS test_name;
+ test_name
+--------------------------------------
+ Test 5: Non-contiguous prefix values
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[2, 5, 8],
+ 50,
+ 5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches
+-----------------+-----------------+--------------------------
+ 5 | 8 | 7
+(1 row)
+
+-- ============================================================================
+-- Test 6: Timestamp suffix column
+-- Query: WHERE user_id IN (1,2,3) AND event_time >= '2026-01-01 01:00:00' LIMIT 5
+-- K = 3, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+SELECT 'Test 6: Timestamp suffix' AS test_name;
+ test_name
+--------------------------
+ Test 6: Timestamp suffix
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_ts(
+ 'merge_test_ts',
+ 'merge_test_ts_idx',
+ ARRAY[1, 2, 3],
+ '2026-01-01 01:00:00'::timestamp,
+ 5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches
+-----------------+-----------------+--------------------------
+ 5 | 8 | 7
+(1 row)
+
+-- ============================================================================
+-- Test 7: All users with timestamp
+-- K = 5, LIMIT = 10
+-- Expected tuples accessed: 10 + 5 - 1 = 14
+-- ============================================================================
+SELECT 'Test 7: All users timestamp' AS test_name;
+ test_name
+-----------------------------
+ Test 7: All users timestamp
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_ts(
+ 'merge_test_ts',
+ 'merge_test_ts_idx',
+ ARRAY[1, 2, 3, 4, 5],
+ '2026-01-01 00:30:00'::timestamp,
+ 10
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches
+-----------------+-----------------+--------------------------
+ 10 | 15 | 14
+(1 row)
+
+-- ============================================================================
+-- Test 8: Correctness verification
+-- Verify merge scan returns rows in exact ORDER BY suffix_col, prefix_col order
+-- Using WITH ORDINALITY to compare row positions
+-- ============================================================================
+SELECT 'Test 8: Correctness verification' AS test_name;
+ test_name
+----------------------------------
+ Test 8: Correctness verification
+(1 row)
+
+-- Compare merge scan vs regular query with row positions (should be empty)
+WITH merge_result AS (
+ SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+ FROM test_btree_merge_fetch_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3],
+ 90,
+ 10
+ )
+),
+regular_result AS (
+ SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+ FROM (
+ SELECT prefix_col, suffix_col
+ FROM merge_test_int
+ WHERE prefix_col IN (1, 2, 3) AND suffix_col >= 90
+ ORDER BY suffix_col, prefix_col
+ LIMIT 10
+ ) t
+)
+SELECT 'MISMATCH' AS status, m.rn, m.prefix_col, m.suffix_col,
+ r.prefix_col AS expected_prefix, r.suffix_col AS expected_suffix
+FROM merge_result m
+FULL OUTER JOIN regular_result r ON m.rn = r.rn
+WHERE m.prefix_col IS DISTINCT FROM r.prefix_col
+ OR m.suffix_col IS DISTINCT FROM r.suffix_col;
+ status | rn | prefix_col | suffix_col | expected_prefix | expected_suffix
+--------+----+------------+------------+-----------------+-----------------
+(0 rows)
+
+-- ============================================================================
+-- Cleanup
+-- ============================================================================
+DROP TABLE merge_test_int;
+DROP TABLE merge_test_ts;
+DROP EXTENSION test_btree_merge;
diff --git a/src/test/modules/test_btree_merge/meson.build b/src/test/modules/test_btree_merge/meson.build
new file mode 100644
index 00000000000..665d6cf443e
--- /dev/null
+++ b/src/test/modules/test_btree_merge/meson.build
@@ -0,0 +1,33 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+test_btree_merge_sources = files(
+ 'test_btree_merge.c',
+)
+
+if host_system == 'windows'
+ test_btree_merge_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'test_btree_merge',
+ '--FILEDESC', 'test_btree_merge - test code for btree merge scan',])
+endif
+
+test_btree_merge = shared_module('test_btree_merge',
+ test_btree_merge_sources,
+ kwargs: pg_test_mod_args,
+)
+test_install_libs += test_btree_merge
+
+test_install_data += files(
+ 'test_btree_merge.control',
+ 'test_btree_merge--1.0.sql',
+)
+
+tests += {
+ 'name': 'test_btree_merge',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'test_btree_merge',
+ ],
+ },
+}
diff --git a/src/test/modules/test_btree_merge/sql/test_btree_merge.sql b/src/test/modules/test_btree_merge/sql/test_btree_merge.sql
new file mode 100644
index 00000000000..5828b343b34
--- /dev/null
+++ b/src/test/modules/test_btree_merge/sql/test_btree_merge.sql
@@ -0,0 +1,207 @@
+-- Unit tests for B-tree merge scan implementation
+-- Tests the core merge scan algorithm directly, bypassing the planner
+
+CREATE EXTENSION test_btree_merge;
+
+-- ============================================================================
+-- Setup: Create test tables with known data distributions
+-- ============================================================================
+
+-- Test table with integer prefix and suffix
+CREATE TABLE merge_test_int (
+ prefix_col int4,
+ suffix_col int4
+);
+
+-- Insert data: 10 prefix values, 100 suffix values each = 1000 rows
+INSERT INTO merge_test_int
+SELECT p, s
+FROM generate_series(1, 10) AS p,
+ generate_series(1, 100) AS s;
+
+CREATE INDEX merge_test_int_idx ON merge_test_int (prefix_col, suffix_col);
+ANALYZE merge_test_int;
+
+-- Test table with integer prefix and timestamp suffix
+CREATE TABLE merge_test_ts (
+ user_id int4,
+ event_time timestamp
+);
+
+-- Insert data: 5 users, 100 events each
+INSERT INTO merge_test_ts
+SELECT u, '2026-01-01 00:00:00'::timestamp + (e || ' minutes')::interval
+FROM generate_series(1, 5) AS u,
+ generate_series(1, 100) AS e;
+
+CREATE INDEX merge_test_ts_idx ON merge_test_ts (user_id, event_time);
+ANALYZE merge_test_ts;
+
+
+-- ============================================================================
+-- Test 1: Basic integer merge scan
+-- Query: WHERE prefix IN (1,2,3) AND suffix >= 50 LIMIT 5
+-- K = 3 prefix values, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+
+SELECT 'Test 1: Basic integer merge scan' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3],
+ 50,
+ 5
+);
+
+
+-- ============================================================================
+-- Test 2: More prefix values
+-- Query: WHERE prefix IN (1,2,3,4,5) AND suffix >= 80 LIMIT 3
+-- K = 5 prefix values, LIMIT = 3
+-- Expected tuples accessed: 3 + 5 - 1 = 7
+-- ============================================================================
+
+SELECT 'Test 2: More prefix values' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3, 4, 5],
+ 80,
+ 3
+);
+
+
+-- ============================================================================
+-- Test 3: Single prefix value (degenerates to regular scan)
+-- K = 1, LIMIT = 5
+-- Expected tuples accessed: 5 + 1 - 1 = 5
+-- ============================================================================
+
+SELECT 'Test 3: Single prefix value' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1],
+ 50,
+ 5
+);
+
+
+-- ============================================================================
+-- Test 4: Large LIMIT (more than matching rows)
+-- K = 3, prefix values that have 51 rows each (suffix >= 50)
+-- LIMIT = 200 but only 153 rows exist
+-- ============================================================================
+
+SELECT 'Test 4: Large LIMIT' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3],
+ 50,
+ 200
+);
+
+
+-- ============================================================================
+-- Test 5: Non-contiguous prefix values
+-- Query: WHERE prefix IN (2,5,8) AND suffix >= 50 LIMIT 5
+-- Tests that merge scan works with gaps in prefix values
+-- K = 3 prefix values (non-adjacent), LIMIT = 5
+-- ============================================================================
+
+SELECT 'Test 5: Non-contiguous prefix values' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[2, 5, 8],
+ 50,
+ 5
+);
+
+
+-- ============================================================================
+-- Test 6: Timestamp suffix column
+-- Query: WHERE user_id IN (1,2,3) AND event_time >= '2026-01-01 01:00:00' LIMIT 5
+-- K = 3, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+
+SELECT 'Test 6: Timestamp suffix' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_ts(
+ 'merge_test_ts',
+ 'merge_test_ts_idx',
+ ARRAY[1, 2, 3],
+ '2026-01-01 01:00:00'::timestamp,
+ 5
+);
+
+
+-- ============================================================================
+-- Test 7: All users with timestamp
+-- K = 5, LIMIT = 10
+-- Expected tuples accessed: 10 + 5 - 1 = 14
+-- ============================================================================
+
+SELECT 'Test 7: All users timestamp' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_ts(
+ 'merge_test_ts',
+ 'merge_test_ts_idx',
+ ARRAY[1, 2, 3, 4, 5],
+ '2026-01-01 00:30:00'::timestamp,
+ 10
+);
+
+
+-- ============================================================================
+-- Test 8: Correctness verification
+-- Verify merge scan returns rows in exact ORDER BY suffix_col, prefix_col order
+-- Using WITH ORDINALITY to compare row positions
+-- ============================================================================
+
+SELECT 'Test 8: Correctness verification' AS test_name;
+
+-- Compare merge scan vs regular query with row positions (should be empty)
+WITH merge_result AS (
+ SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+ FROM test_btree_merge_fetch_int(
+ 'merge_test_int',
+ 'merge_test_int_idx',
+ ARRAY[1, 2, 3],
+ 90,
+ 10
+ )
+),
+regular_result AS (
+ SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+ FROM (
+ SELECT prefix_col, suffix_col
+ FROM merge_test_int
+ WHERE prefix_col IN (1, 2, 3) AND suffix_col >= 90
+ ORDER BY suffix_col, prefix_col
+ LIMIT 10
+ ) t
+)
+SELECT 'MISMATCH' AS status, m.rn, m.prefix_col, m.suffix_col,
+ r.prefix_col AS expected_prefix, r.suffix_col AS expected_suffix
+FROM merge_result m
+FULL OUTER JOIN regular_result r ON m.rn = r.rn
+WHERE m.prefix_col IS DISTINCT FROM r.prefix_col
+ OR m.suffix_col IS DISTINCT FROM r.suffix_col;
+
+
+-- ============================================================================
+-- Cleanup
+-- ============================================================================
+
+DROP TABLE merge_test_int;
+DROP TABLE merge_test_ts;
+DROP EXTENSION test_btree_merge;
diff --git a/src/test/modules/test_btree_merge/test_btree_merge--1.0.sql b/src/test/modules/test_btree_merge/test_btree_merge--1.0.sql
new file mode 100644
index 00000000000..9872947d7d7
--- /dev/null
+++ b/src/test/modules/test_btree_merge/test_btree_merge--1.0.sql
@@ -0,0 +1,43 @@
+/* src/test/modules/test_btree_merge/test_btree_merge--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_btree_merge" to load this file. \quit
+
+-- Test merge scan with integer columns
+CREATE FUNCTION test_btree_merge_scan_int(
+ table_name text,
+ index_name text,
+ prefix_values int4[],
+ suffix_start int4,
+ limit_count int4
+) RETURNS TABLE (
+ tuples_returned int4,
+ tuples_accessed int4,
+ maximum_required_fetches int4
+) AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+-- Fetch actual rows from merge scan (for correctness verification)
+CREATE FUNCTION test_btree_merge_fetch_int(
+ table_name text,
+ index_name text,
+ prefix_values int4[],
+ suffix_start int4,
+ limit_count int4
+) RETURNS TABLE (
+ prefix_col int4,
+ suffix_col int4
+) AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+-- Test merge scan with timestamp suffix
+CREATE FUNCTION test_btree_merge_scan_ts(
+ table_name text,
+ index_name text,
+ prefix_values int4[],
+ suffix_start timestamp,
+ limit_count int4
+) RETURNS TABLE (
+ tuples_returned int4,
+ tuples_accessed int4,
+ maximum_required_fetches int4
+) AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
diff --git a/src/test/modules/test_btree_merge/test_btree_merge.c b/src/test/modules/test_btree_merge/test_btree_merge.c
new file mode 100644
index 00000000000..78b22130ecf
--- /dev/null
+++ b/src/test/modules/test_btree_merge/test_btree_merge.c
@@ -0,0 +1,389 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_btree_merge.c
+ * Unit tests for B-tree Merge Scan implementation
+ *
+ * This module provides SQL-callable functions to directly test the
+ * merge scan algorithm without going through the planner.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/nbtree.h"
+#include "access/table.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_type.h"
+#include "commands/defrem.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
+
+PG_MODULE_MAGIC;
+
+#define MAX_RESULTS 10000
+
+/*
+ * MergeScanResult - holds results from a merge scan execution
+ */
+typedef struct MergeScanResult
+{
+ int tuples_returned;
+ int64 tuples_accessed;
+ int num_prefixes;
+ int limit_count;
+ /* For fetch function: collected row data */
+ int32 *prefixes;
+ int32 *suffixes;
+} MergeScanResult;
+
+/*
+ * do_merge_scan - common merge scan execution
+ *
+ * Performs a merge scan with the given parameters and collects results.
+ * If collect_rows is true, fetches and stores actual row data.
+ */
+static void
+do_merge_scan(const char *table_name,
+ const char *index_name,
+ Datum *prefix_values,
+ bool *prefix_nulls,
+ int num_prefixes,
+ Datum suffix_start,
+ Oid suffix_type,
+ RegProcedure suffix_eq_proc,
+ RegProcedure suffix_ge_proc,
+ int limit_count,
+ bool collect_rows,
+ MergeScanResult *result)
+{
+ Oid table_oid;
+ Oid index_oid;
+ Relation heap_rel;
+ Relation index_rel;
+ IndexScanDesc scan;
+ BTScanOpaque so;
+ BTMergeScanState *merge_state;
+ Snapshot snapshot;
+ Oid suffix_cmp_oid;
+ Oid opfamily;
+ const char *opfamily_name;
+ int tuples_returned = 0;
+ int max_results;
+
+ /* Determine operator family based on suffix type */
+ if (suffix_type == INT4OID)
+ opfamily_name = "integer_ops";
+ else if (suffix_type == TIMESTAMPOID)
+ opfamily_name = "datetime_ops";
+ else
+ elog(ERROR, "unsupported suffix type: %u", suffix_type);
+
+ /* Look up table and index */
+ table_oid = RelnameGetRelid(table_name);
+ if (!OidIsValid(table_oid))
+ elog(ERROR, "table \"%s\" does not exist", table_name);
+
+ index_oid = RelnameGetRelid(index_name);
+ if (!OidIsValid(index_oid))
+ elog(ERROR, "index \"%s\" does not exist", index_name);
+
+ /* Open relations */
+ heap_rel = table_open(table_oid, AccessShareLock);
+ index_rel = index_open(index_oid, AccessShareLock);
+
+ /* Get comparison function for suffix type */
+ opfamily = get_opfamily_oid(BTREE_AM_OID,
+ list_make1(makeString(pstrdup(opfamily_name))),
+ false);
+ suffix_cmp_oid = get_opfamily_proc(opfamily, suffix_type, suffix_type,
+ BTORDER_PROC);
+ if (!OidIsValid(suffix_cmp_oid))
+ elog(ERROR, "could not find comparison function for type %u", suffix_type);
+
+ /* Begin index scan */
+ snapshot = GetActiveSnapshot();
+ scan = index_beginscan(heap_rel, index_rel, snapshot, NULL, 2, 0);
+
+ /* Set up scan keys */
+ {
+ ScanKeyData keys[2];
+
+ ScanKeyInit(&keys[0], 1, BTEqualStrategyNumber, suffix_eq_proc,
+ prefix_values[0]);
+ ScanKeyInit(&keys[1], 2, BTGreaterEqualStrategyNumber, suffix_ge_proc,
+ suffix_start);
+ index_rescan(scan, keys, 2, NULL, 0);
+ }
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* Initialize merge scan */
+ merge_state = bt_merge_init(scan, prefix_values, prefix_nulls,
+ num_prefixes, 1, 2, suffix_cmp_oid, InvalidOid);
+ so->mergeState = merge_state;
+
+ /* Execute scan */
+ max_results = (limit_count > 0) ? limit_count : MAX_RESULTS;
+
+ while (tuples_returned < max_results)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ if (!bt_merge_getnext(scan, ForwardScanDirection))
+ break;
+
+ if (collect_rows && result->prefixes != NULL)
+ {
+ /* Fetch heap tuple to get actual values */
+ HeapTupleData heapTuple;
+ Buffer heapBuffer;
+ bool isnull;
+
+ heapTuple.t_self = scan->xs_heaptid;
+ if (heap_fetch(heap_rel, snapshot, &heapTuple, &heapBuffer, false))
+ {
+ result->prefixes[tuples_returned] =
+ DatumGetInt32(heap_getattr(&heapTuple, 1,
+ RelationGetDescr(heap_rel), &isnull));
+ result->suffixes[tuples_returned] =
+ DatumGetInt32(heap_getattr(&heapTuple, 2,
+ RelationGetDescr(heap_rel), &isnull));
+ ReleaseBuffer(heapBuffer);
+ }
+ }
+
+ tuples_returned++;
+
+ if (tuples_returned >= MAX_RESULTS)
+ {
+ elog(WARNING, "merge scan hit safety limit of %d tuples", MAX_RESULTS);
+ break;
+ }
+ }
+
+ /* Collect results before cleanup */
+ result->tuples_returned = tuples_returned;
+ result->tuples_accessed = merge_state->tuples_accessed;
+ result->num_prefixes = num_prefixes;
+ result->limit_count = limit_count;
+
+ /* Clean up */
+ bt_merge_end(merge_state);
+ so->mergeState = NULL;
+ index_endscan(scan);
+ index_close(index_rel, AccessShareLock);
+ table_close(heap_rel, AccessShareLock);
+}
+
+/*
+ * build_stats_result - build the stats result tuple
+ */
+static Datum
+build_stats_result(FunctionCallInfo fcinfo, MergeScanResult *result)
+{
+ TupleDesc tupdesc;
+ Datum values[3];
+ bool nulls[3] = {false, false, false};
+ HeapTuple tuple;
+ int max_required_fetches;
+
+ /* Calculate expected max fetches */
+ if (result->tuples_returned < result->limit_count)
+ max_required_fetches = result->tuples_returned;
+ else
+ max_required_fetches = result->limit_count + result->num_prefixes - 1;
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("function returning record called in context "
+ "that cannot accept type record")));
+
+ tupdesc = BlessTupleDesc(tupdesc);
+
+ values[0] = Int32GetDatum(result->tuples_returned);
+ values[1] = Int32GetDatum((int32) result->tuples_accessed);
+ values[2] = Int32GetDatum(max_required_fetches);
+
+ tuple = heap_form_tuple(tupdesc, values, nulls);
+ return HeapTupleGetDatum(tuple);
+}
+
+
+/*
+ * test_btree_merge_scan_int - test merge scan with integer columns
+ */
+PG_FUNCTION_INFO_V1(test_btree_merge_scan_int);
+
+Datum
+test_btree_merge_scan_int(PG_FUNCTION_ARGS)
+{
+ text *table_name = PG_GETARG_TEXT_PP(0);
+ text *index_name = PG_GETARG_TEXT_PP(1);
+ ArrayType *prefix_array = PG_GETARG_ARRAYTYPE_P(2);
+ int32 suffix_start = PG_GETARG_INT32(3);
+ int32 limit_count = PG_GETARG_INT32(4);
+ Datum *prefix_values;
+ bool *prefix_nulls;
+ int num_prefixes;
+ MergeScanResult result = {0};
+
+ deconstruct_array(prefix_array, INT4OID, sizeof(int32), true, TYPALIGN_INT,
+ &prefix_values, &prefix_nulls, &num_prefixes);
+
+ if (num_prefixes == 0)
+ elog(ERROR, "prefix_values array cannot be empty");
+
+ do_merge_scan(text_to_cstring(table_name),
+ text_to_cstring(index_name),
+ prefix_values, prefix_nulls, num_prefixes,
+ Int32GetDatum(suffix_start), INT4OID,
+ F_INT4EQ, F_INT4GE,
+ limit_count, false, &result);
+
+ return build_stats_result(fcinfo, &result);
+}
+
+
+/*
+ * test_btree_merge_scan_ts - test merge scan with timestamp suffix
+ */
+PG_FUNCTION_INFO_V1(test_btree_merge_scan_ts);
+
+Datum
+test_btree_merge_scan_ts(PG_FUNCTION_ARGS)
+{
+ text *table_name = PG_GETARG_TEXT_PP(0);
+ text *index_name = PG_GETARG_TEXT_PP(1);
+ ArrayType *prefix_array = PG_GETARG_ARRAYTYPE_P(2);
+ Timestamp suffix_start = PG_GETARG_TIMESTAMP(3);
+ int32 limit_count = PG_GETARG_INT32(4);
+ Datum *prefix_values;
+ bool *prefix_nulls;
+ int num_prefixes;
+ MergeScanResult result = {0};
+
+ deconstruct_array(prefix_array, INT4OID, sizeof(int32), true, TYPALIGN_INT,
+ &prefix_values, &prefix_nulls, &num_prefixes);
+
+ if (num_prefixes == 0)
+ elog(ERROR, "prefix_values array cannot be empty");
+
+ do_merge_scan(text_to_cstring(table_name),
+ text_to_cstring(index_name),
+ prefix_values, prefix_nulls, num_prefixes,
+ TimestampGetDatum(suffix_start), TIMESTAMPOID,
+ F_INT4EQ, F_TIMESTAMP_GE,
+ limit_count, false, &result);
+
+ return build_stats_result(fcinfo, &result);
+}
+
+
+/*
+ * test_btree_merge_fetch_int - fetch actual rows from merge scan
+ */
+PG_FUNCTION_INFO_V1(test_btree_merge_fetch_int);
+
+Datum
+test_btree_merge_fetch_int(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+
+ typedef struct
+ {
+ int32 *prefixes;
+ int32 *suffixes;
+ int num_results;
+ int current_idx;
+ } FetchContext;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ text *table_name = PG_GETARG_TEXT_PP(0);
+ text *index_name = PG_GETARG_TEXT_PP(1);
+ ArrayType *prefix_array = PG_GETARG_ARRAYTYPE_P(2);
+ int32 suffix_start = PG_GETARG_INT32(3);
+ int32 limit_count = PG_GETARG_INT32(4);
+ Datum *prefix_values;
+ bool *prefix_nulls;
+ int num_prefixes;
+ MemoryContext oldcontext;
+ FetchContext *fctx;
+ MergeScanResult result = {0};
+ TupleDesc tupdesc;
+ int max_results;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ deconstruct_array(prefix_array, INT4OID, sizeof(int32), true, TYPALIGN_INT,
+ &prefix_values, &prefix_nulls, &num_prefixes);
+
+ if (num_prefixes == 0)
+ elog(ERROR, "prefix_values array cannot be empty");
+
+ /* Allocate result storage */
+ max_results = (limit_count > 0) ? limit_count : MAX_RESULTS;
+ fctx = palloc(sizeof(FetchContext));
+ fctx->prefixes = palloc(max_results * sizeof(int32));
+ fctx->suffixes = palloc(max_results * sizeof(int32));
+ fctx->current_idx = 0;
+
+ /* Point result to our storage */
+ result.prefixes = fctx->prefixes;
+ result.suffixes = fctx->suffixes;
+
+ do_merge_scan(text_to_cstring(table_name),
+ text_to_cstring(index_name),
+ prefix_values, prefix_nulls, num_prefixes,
+ Int32GetDatum(suffix_start), INT4OID,
+ F_INT4EQ, F_INT4GE,
+ limit_count, true, &result);
+
+ fctx->num_results = result.tuples_returned;
+
+ /* Build result tuple descriptor */
+ tupdesc = CreateTemplateTupleDesc(2);
+ TupleDescInitEntry(tupdesc, 1, "prefix_col", INT4OID, -1, 0);
+ TupleDescInitEntry(tupdesc, 2, "suffix_col", INT4OID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+ funcctx->user_fctx = fctx;
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ {
+ FetchContext *fctx = funcctx->user_fctx;
+
+ if (fctx->current_idx < fctx->num_results)
+ {
+ Datum values[2];
+ bool nulls[2] = {false, false};
+ HeapTuple tuple;
+
+ values[0] = Int32GetDatum(fctx->prefixes[fctx->current_idx]);
+ values[1] = Int32GetDatum(fctx->suffixes[fctx->current_idx]);
+ fctx->current_idx++;
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ {
+ SRF_RETURN_DONE(funcctx);
+ }
+ }
+}
diff --git a/src/test/modules/test_btree_merge/test_btree_merge.control b/src/test/modules/test_btree_merge/test_btree_merge.control
new file mode 100644
index 00000000000..f8146bd0f74
--- /dev/null
+++ b/src/test/modules/test_btree_merge/test_btree_merge.control
@@ -0,0 +1,5 @@
+# test_btree_merge extension
+comment = 'Unit tests for B-tree merge scan'
+default_version = '1.0'
+module_pathname = '$libdir/test_btree_merge'
+relocatable = true
diff --git a/src/test/regress/expected/btree_merge.out b/src/test/regress/expected/btree_merge.out
new file mode 100644
index 00000000000..441ae1d0657
--- /dev/null
+++ b/src/test/regress/expected/btree_merge.out
@@ -0,0 +1,113 @@
+-- B-Tree Merge Scan Access Method Test
+--
+-- B-Tree Merge Scan is an access method that allows lazily producing
+-- output sorted by a non-leading column when the prefix has few distinct values.
+--
+--
+-- Let S be an infinite set of lattic points (x,y).
+-- Let S(x=1,y>=b) be the sequence of points
+-- SELECT * FROM S WHERE x = a and y >= b ORDER BY b;
+-- i.e. (a, b), (a, b+1), (a, b+2), ...
+-- Similarly, S(x IN X, y=b) being the sequence of points
+-- SELECT * FROM S WHERE x IN X and y = b ORDER BY x;
+-- i.e. (x[1], b), ..., (x[n], b), (x[1], b+1), ...
+-- The output of S(x IN X, y >= b) can be computed as a
+--
+-- Proposition (uncomputable):
+-- S(x, IN X, y >= b) is the K-way merge of the sequences
+-- {S(x=x[i], y >= b), x[i] in X}
+--
+--
+--
+-- Proposition (computable): Bounded suffix
+--
+-- S(x, IN X, b1 <= y <= b2) as bounded
+-- can be computed with (SELECT count(distinct x) + count(1) FROM bounded)
+-- tuple accesses.
+-- (Constructive) Proof:
+-- The result of
+-- SELECT * FROM X
+-- JOIN S on x = x[i] WHERE y BETWEEN b1 AND b2;
+-- is the same as
+-- SELECT * FROM X,
+-- LATERAL (
+-- (SELECT * FROM S
+-- WHERE x = x[i] AND y BETWEEN b1 AND b2
+-- ) AS subscan[i]
+-- ) as merged
+--
+-- Each of subscan[i] is covered by a single range in the index and can
+-- and require at most
+-- (count(1) FROM subscan[i]) + 1 -- subscan tuple access count
+-- tupples to be accessed.
+-- The merged result can be computed using a K-way merge sort
+-- whose number of rows is
+-- sum(count(1) FROM subscan[i]) -- query output rows
+-- Q.E.D.
+--
+--
+-- Proposition (computable): Limitted query
+-- The query
+-- S(x, IN X, y >= b) LIMIT N as limited
+-- Can be computed with at most
+-- N + count(distinct X) - 1
+-- tuple accesses.
+--
+-- (Constructive) Proof:
+-- If an upper `u` bound for `MAX(y IN S(x, IN X, y >= b) LIMIT N)` is known,
+-- then the query can be rewritten as
+-- S(x, IN X, b <= y <= u) LIMIT N
+-- The K-way can produce the next element as soon as it has fetched
+-- the next element for each subquery
+-- 1 row can be produced after count(distinct X) fetches,
+-- After that it can produce one new row for each fetch.
+-- Thus, the total number of fetches is at most
+-- N + count(distinct X) - 1
+-- Q.E.D.
+-- Generate a table with lattice points
+-- Could be infinite
+CREATE TABLE btree_merge_test AS (
+ SELECT x, y FROM
+ generate_series(1, 50) AS x,
+ generate_series(1, 50) AS y
+ ORDER BY random()
+);
+CREATE INDEX btree_merge_test_idx ON btree_merge_test USING btree (x, y);
+ANALYSE btree_merge_test;
+SET enable_seqscan = OFF;
+SET enable_bitmapscan = OFF;
+SHOW track_counts; -- should be 'on'
+ track_counts
+--------------
+ on
+(1 row)
+
+-- From the limited query proposition this can be computed with 10
+-- tupple accesses.
+SELECT x, y
+FROM btree_merge_test
+WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19
+ORDER BY y, x -- sort x to make result unique
+LIMIT 3;
+ x | y
+---+----
+ 1 | 19
+ 2 | 19
+ 5 | 19
+(3 rows)
+
+SELECT pg_stat_force_next_flush();
+ pg_stat_force_next_flush
+--------------------------
+
+(1 row)
+
+SELECT idx_scan, idx_tup_read, idx_tup_fetch
+FROM pg_stat_user_indexes
+WHERE indexrelname = 'btree_merge_test_idx';
+ idx_scan | idx_tup_read | idx_tup_fetch
+----------+--------------+---------------
+ 5 | 10 | 10
+(1 row)
+
+DROP TABLE btree_merge_test;
diff --git a/src/test/regress/sql/btree_merge.sql b/src/test/regress/sql/btree_merge.sql
new file mode 100644
index 00000000000..be00c33c2a5
--- /dev/null
+++ b/src/test/regress/sql/btree_merge.sql
@@ -0,0 +1,100 @@
+-- B-Tree Merge Scan Access Method Test
+--
+-- B-Tree Merge Scan is an access method that allows lazily producing
+-- output sorted by a non-leading column when the prefix has few distinct values.
+--
+--
+-- Let S be an infinite set of lattic points (x,y).
+-- Let S(x=1,y>=b) be the sequence of points
+-- SELECT * FROM S WHERE x = a and y >= b ORDER BY b;
+-- i.e. (a, b), (a, b+1), (a, b+2), ...
+-- Similarly, S(x IN X, y=b) being the sequence of points
+-- SELECT * FROM S WHERE x IN X and y = b ORDER BY x;
+-- i.e. (x[1], b), ..., (x[n], b), (x[1], b+1), ...
+-- The output of S(x IN X, y >= b) can be computed as a
+--
+-- Proposition (uncomputable):
+-- S(x, IN X, y >= b) is the K-way merge of the sequences
+-- {S(x=x[i], y >= b), x[i] in X}
+--
+--
+--
+-- Proposition (computable): Bounded suffix
+--
+-- S(x, IN X, b1 <= y <= b2) as bounded
+-- can be computed with (SELECT count(distinct x) + count(1) FROM bounded)
+-- tuple accesses.
+-- (Constructive) Proof:
+-- The result of
+-- SELECT * FROM X
+-- JOIN S on x = x[i] WHERE y BETWEEN b1 AND b2;
+-- is the same as
+-- SELECT * FROM X,
+-- LATERAL (
+-- (SELECT * FROM S
+-- WHERE x = x[i] AND y BETWEEN b1 AND b2
+-- ) AS subscan[i]
+-- ) as merged
+--
+-- Each of subscan[i] is covered by a single range in the index and can
+-- and require at most
+-- (count(1) FROM subscan[i]) + 1 -- subscan tuple access count
+-- tupples to be accessed.
+-- The merged result can be computed using a K-way merge sort
+-- whose number of rows is
+-- sum(count(1) FROM subscan[i]) -- query output rows
+-- Q.E.D.
+--
+--
+-- Proposition (computable): Limitted query
+-- The query
+-- S(x, IN X, y >= b) LIMIT N as limited
+-- Can be computed with at most
+-- N + count(distinct X) - 1
+-- tuple accesses.
+--
+-- (Constructive) Proof:
+-- If an upper `u` bound for `MAX(y IN S(x, IN X, y >= b) LIMIT N)` is known,
+-- then the query can be rewritten as
+-- S(x, IN X, b <= y <= u) LIMIT N
+-- The K-way can produce the next element as soon as it has fetched
+-- the next element for each subquery
+-- 1 row can be produced after count(distinct X) fetches,
+-- After that it can produce one new row for each fetch.
+-- Thus, the total number of fetches is at most
+-- N + count(distinct X) - 1
+-- Q.E.D.
+
+
+-- Generate a table with lattice points
+-- Could be infinite
+CREATE TABLE btree_merge_test AS (
+ SELECT x, y FROM
+ generate_series(1, 50) AS x,
+ generate_series(1, 50) AS y
+ ORDER BY random()
+);
+CREATE INDEX btree_merge_test_idx ON btree_merge_test USING btree (x, y);
+
+ANALYSE btree_merge_test;
+
+SET enable_seqscan = OFF;
+SET enable_bitmapscan = OFF;
+SHOW track_counts; -- should be 'on'
+-- From the limited query proposition this can be computed with 10
+-- tupple accesses.
+SELECT x, y
+FROM btree_merge_test
+WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19
+ORDER BY y, x -- sort x to make result unique
+LIMIT 3;
+
+
+SELECT pg_stat_force_next_flush();
+
+
+SELECT idx_scan, idx_tup_read, idx_tup_fetch
+FROM pg_stat_user_indexes
+WHERE indexrelname = 'btree_merge_test_idx';
+
+DROP TABLE btree_merge_test;
\ No newline at end of file
view thread (12+ messages) latest in thread
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected]
Subject: Re: New access method for b-tree.
In-Reply-To: <AS1PR02MB784695AFEC37179FFAF7EAE19A9DA@AS1PR02MB7846.eurprd02.prod.outlook.com>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox