New access method for b-tree.

public inbox for [email protected]  
help / color / mirror / Atom feed

From: Alexandre Felipe <[email protected]>
To: [email protected] <[email protected]>
Subject: New access method for b-tree.
Date: Sun, 1 Feb 2026 10:02:06 +0000
Message-ID: <AS1PR02MB784695AFEC37179FFAF7EAE19A9DA@AS1PR02MB7846.eurprd02.prod.outlook.com> (raw)

Hello Hackers,

Please check this out,

It is an access method to scan a table sorting by the second column of an index, filtered on the first.
Queries like
SELECT x, y FROM grid
WHERE x in (array of Nx elements)
ORDER BY y, x
LIMIT M

Can execute streaming the rows directly from disk instead of loading everything.

Using  btree index on (x, y)

On a grid with N x N will run by fetching only what is necessary
A skip scal will run with O(N * Nx) I/O, O(N x Nx) space, O(N x Nx * log( N * Nx)) comput (assuming a generic in memory sort)

The proposed access method does it O(M + Nx) I/O, O(Nx) space, and O(M * log(Nx)) compute.



Kind Regards,


Alexandre Felipe

Research & Development Engineer







Attachments:

  [application/octet-stream] btree_merge_rebased.patch (54.5K, 3-btree_merge_rebased.patch)
  download | inline diff:
diff --git a/.gitignore b/.gitignore
index 4e911395fe3..ac1f95d9cf0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,11 @@ lib*.pc
 /Release/
 /tmp_install/
 /portlock/
+
+# hidden files (e.g. .dbdata, .install, good practice to test locally in isolation)
+.*
+
+# Test output
+**/regression.diffs
+**/regression.out
+**/results/
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
index 0daf640af96..72053cefdaa 100644
--- a/src/backend/access/nbtree/Makefile
+++ b/src/backend/access/nbtree/Makefile
@@ -16,6 +16,7 @@ OBJS = \
 	nbtcompare.o \
 	nbtdedup.o \
 	nbtinsert.o \
+	nbtmergescan.o \
 	nbtpage.o \
 	nbtpreprocesskeys.o \
 	nbtreadpage.o \
diff --git a/src/backend/access/nbtree/meson.build b/src/backend/access/nbtree/meson.build
index 812f067e710..1016fea62d5 100644
--- a/src/backend/access/nbtree/meson.build
+++ b/src/backend/access/nbtree/meson.build
@@ -4,6 +4,7 @@ backend_sources += files(
   'nbtcompare.c',
   'nbtdedup.c',
   'nbtinsert.c',
+  'nbtmergescan.c',
   'nbtpage.c',
   'nbtpreprocesskeys.c',
   'nbtreadpage.c',
diff --git a/src/backend/access/nbtree/nbtmergescan.c b/src/backend/access/nbtree/nbtmergescan.c
new file mode 100644
index 00000000000..70828dc73d3
--- /dev/null
+++ b/src/backend/access/nbtree/nbtmergescan.c
@@ -0,0 +1,457 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtmergescan.c
+ *	  B-Tree merge scan for efficient evaluation of IN-list queries
+ *
+ * This module implements a K-way merge scan for B-tree indexes, optimized
+ * for queries of the form:
+ *   WHERE prefix IN (v1, v2, ..., vK) AND suffix >= b ORDER BY suffix LIMIT N
+ *
+ * The algorithm maintains a min-heap of cursors, one per prefix value.
+ * Each cursor tracks its position within the index for that prefix.
+ * Tuples are returned in suffix order by repeatedly extracting the
+ * minimum from the heap.
+ *
+ * Target behavior: Access at most N + K - 1 index tuples for LIMIT N.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/nbtree/nbtmergescan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "lib/pairingheap.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* Forward declarations of static functions */
+static int	bt_merge_heap_cmp(const pairingheap_node *a,
+							  const pairingheap_node *b,
+							  void *arg);
+static bool bt_merge_cursor_init(BTMergeScanState *state,
+								 IndexScanDesc scan,
+								 BTMergeCursor *cursor,
+								 Datum prefix_value,
+								 bool prefix_isnull);
+static bool bt_merge_cursor_advance(BTMergeScanState *state,
+									IndexScanDesc scan,
+									BTMergeCursor *cursor);
+static Datum bt_merge_extract_sortkey(BTMergeScanState *state,
+									  IndexScanDesc scan,
+									  BTMergeCursor *cursor,
+									  bool *isnull);
+
+
+/*
+ * bt_merge_heap_cmp
+ *	  Compare two cursors by their current sort key (suffix value).
+ *
+ * When sort keys are equal, uses prefix value as tiebreaker for
+ * deterministic ordering (ORDER BY suffix, prefix).
+ *
+ * Returns positive if a > b (pairingheap is a max-heap, we want min-heap
+ * behavior so we invert the comparison).
+ */
+static int
+bt_merge_heap_cmp(const pairingheap_node *a,
+				  const pairingheap_node *b,
+				  void *arg)
+{
+	BTMergeScanState *state = (BTMergeScanState *) arg;
+	BTMergeCursor *cursor_a = pairingheap_container(BTMergeCursor, ph_node,
+													(pairingheap_node *) a);
+	BTMergeCursor *cursor_b = pairingheap_container(BTMergeCursor, ph_node,
+													(pairingheap_node *) b);
+	Datum		key_a = cursor_a->sort_key;
+	Datum		key_b = cursor_b->sort_key;
+	bool		null_a = cursor_a->sort_key_isnull;
+	bool		null_b = cursor_b->sort_key_isnull;
+	int32		cmp;
+
+	/* Handle NULLs - NULLs sort last (NULLS LAST default for ASC) */
+	if (null_a && null_b)
+		return 0;
+	if (null_a)
+		return -1;				/* a is NULL, comes after b */
+	if (null_b)
+		return 1;				/* b is NULL, comes after a */
+
+	/* Compare using the suffix column's comparison function */
+	cmp = DatumGetInt32(FunctionCall2Coll(&state->suffix_cmp,
+										  state->suffix_collation,
+										  key_a, key_b));
+
+	/*
+	 * Use prefix value as tiebreaker for deterministic ordering.
+	 * This ensures ORDER BY suffix, prefix behavior.
+	 */
+	if (cmp == 0)
+	{
+		/* Compare prefix values (assumes pass-by-value int4 for now) */
+		int32		prefix_a = DatumGetInt32(cursor_a->prefix_value);
+		int32		prefix_b = DatumGetInt32(cursor_b->prefix_value);
+
+		if (prefix_a < prefix_b)
+			cmp = -1;
+		else if (prefix_a > prefix_b)
+			cmp = 1;
+	}
+
+	/* Negate for min-heap behavior */
+	return -cmp;
+}
+
+
+/*
+ * bt_merge_init
+ *	  Initialize a merge scan state.
+ *
+ * Creates the merge state with one cursor per prefix value.
+ * The cursors will be positioned at their first matching tuples
+ * when bt_merge_getnext is first called.
+ */
+BTMergeScanState *
+bt_merge_init(IndexScanDesc scan,
+			  Datum *prefix_values,
+			  bool *prefix_nulls,
+			  int num_prefixes,
+			  int prefix_attno,
+			  int suffix_attno,
+			  Oid suffix_cmp_oid,
+			  Oid suffix_collation)
+{
+	BTMergeScanState *state;
+	MemoryContext merge_context;
+	MemoryContext old_context;
+	int			i;
+
+	/* Create memory context for merge scan allocations */
+	merge_context = AllocSetContextCreate(CurrentMemoryContext,
+										  "BTMergeScan",
+										  ALLOCSET_DEFAULT_SIZES);
+	old_context = MemoryContextSwitchTo(merge_context);
+
+	/* Allocate main state structure */
+	state = palloc0(sizeof(BTMergeScanState));
+	state->merge_context = merge_context;
+	state->num_cursors = num_prefixes;
+	state->active_cursors = 0;
+	state->prefix_attno = prefix_attno;
+	state->suffix_attno = suffix_attno;
+	state->suffix_collation = suffix_collation;
+	state->direction = ForwardScanDirection;
+	state->initialized = false;
+	state->tuples_accessed = 0;
+
+	/* Set up suffix comparison function */
+	fmgr_info(suffix_cmp_oid, &state->suffix_cmp);
+
+	/* Allocate cursor array */
+	state->cursors = palloc0(num_prefixes * sizeof(BTMergeCursor));
+
+	/* Initialize cursor metadata (not positioned yet) */
+	for (i = 0; i < num_prefixes; i++)
+	{
+		BTMergeCursor *cursor = &state->cursors[i];
+
+		cursor->cursor_id = i;
+		cursor->prefix_value = datumCopy(prefix_values[i], true, sizeof(Datum));
+		cursor->prefix_isnull = prefix_nulls[i];
+		cursor->exhausted = prefix_nulls[i];	/* NULL prefix = exhausted */
+		cursor->sort_key_isnull = true;
+		BTScanPosInvalidate(cursor->pos);
+		cursor->tuples = NULL;
+	}
+
+	/* Initialize the merge heap */
+	state->merge_heap = pairingheap_allocate(bt_merge_heap_cmp, state);
+
+	MemoryContextSwitchTo(old_context);
+
+	return state;
+}
+
+
+/*
+ * bt_merge_getnext
+ *	  Get the next tuple from the merge scan.
+ *
+ * Returns true if a tuple was found, false if scan is exhausted.
+ * The tuple's TID is stored in scan->xs_heaptid.
+ */
+bool
+bt_merge_getnext(IndexScanDesc scan, ScanDirection dir)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BTMergeScanState *state = so->mergeState;
+	BTMergeCursor *cursor;
+	pairingheap_node *node;
+	int			i;
+
+	if (state == NULL)
+		return false;
+
+	/* Initialize cursors on first call */
+	if (!state->initialized)
+	{
+		state->initialized = true;
+		state->direction = dir;
+
+		for (i = 0; i < state->num_cursors; i++)
+		{
+			BTMergeCursor *c = &state->cursors[i];
+
+			if (!c->exhausted &&
+				bt_merge_cursor_init(state, scan, c,
+									 c->prefix_value, c->prefix_isnull))
+			{
+				/* Cursor has at least one tuple, add to heap */
+				pairingheap_add(state->merge_heap, &c->ph_node);
+				state->active_cursors++;
+			}
+		}
+	}
+
+	/* Get the cursor with the smallest suffix value */
+	if (pairingheap_is_empty(state->merge_heap))
+		return false;
+
+	node = pairingheap_remove_first(state->merge_heap);
+	cursor = pairingheap_container(BTMergeCursor, ph_node, node);
+
+	/* Set up the heap TID from the current cursor position */
+	Assert(BTScanPosIsValid(cursor->pos));
+	scan->xs_heaptid = cursor->pos.items[cursor->pos.itemIndex].heapTid;
+
+	/* Advance cursor to next tuple */
+	if (bt_merge_cursor_advance(state, scan, cursor))
+	{
+		/* Cursor still has tuples, re-add to heap */
+		pairingheap_add(state->merge_heap, &cursor->ph_node);
+	}
+	else
+	{
+		/* Cursor exhausted */
+		state->active_cursors--;
+	}
+
+	return true;
+}
+
+
+/*
+ * bt_merge_end
+ *	  Clean up merge scan state.
+ */
+void
+bt_merge_end(BTMergeScanState *state)
+{
+	if (state == NULL)
+		return;
+
+	/* Free the memory context, which frees all allocations */
+	MemoryContextDelete(state->merge_context);
+}
+
+
+/*
+ * bt_merge_cursor_init
+ *	  Initialize a cursor and position it at the first matching tuple.
+ *
+ * Returns true if the cursor found at least one matching tuple.
+ */
+static bool
+bt_merge_cursor_init(BTMergeScanState *state,
+					 IndexScanDesc scan,
+					 BTMergeCursor *cursor,
+					 Datum prefix_value,
+					 bool prefix_isnull)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	bool		found;
+
+	if (prefix_isnull)
+	{
+		cursor->exhausted = true;
+		return false;
+	}
+
+	/*
+	 * Modify the scan key to use this cursor's prefix value.
+	 * We reuse the scan's existing key infrastructure.
+	 */
+	for (int i = 0; i < so->numberOfKeys; i++)
+	{
+		if (so->keyData[i].sk_attno == state->prefix_attno)
+		{
+			so->keyData[i].sk_argument = prefix_value;
+			so->keyData[i].sk_flags &= ~(SK_SEARCHARRAY);
+			break;
+		}
+	}
+
+	/* Invalidate current position to force _bt_first */
+	BTScanPosInvalidate(so->currPos);
+
+	/* Disable array key handling for this cursor's scan */
+	so->numArrayKeys = 0;
+
+	/* Position at first matching tuple */
+	found = _bt_first(scan, state->direction);
+
+	if (found)
+	{
+		/* Copy position to cursor */
+		memcpy(&cursor->pos, &so->currPos, sizeof(BTScanPosData));
+
+		/* Extract the sort key for heap ordering */
+		cursor->sort_key = bt_merge_extract_sortkey(state, scan, cursor,
+													&cursor->sort_key_isnull);
+		cursor->exhausted = false;
+
+		/* Count this as a tuple access */
+		state->tuples_accessed++;
+
+		/* Invalidate main scan position */
+		BTScanPosInvalidate(so->currPos);
+	}
+	else
+	{
+		cursor->exhausted = true;
+	}
+
+	return found;
+}
+
+
+/*
+ * bt_merge_cursor_advance
+ *	  Advance a cursor to its next tuple.
+ *
+ * Returns true if the cursor now points to a valid tuple, false if exhausted.
+ */
+static bool
+bt_merge_cursor_advance(BTMergeScanState *state,
+						IndexScanDesc scan,
+						BTMergeCursor *cursor)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	bool		found = false;
+
+	if (cursor->exhausted)
+		return false;
+
+	/* Try to move to next tuple within current page's items array */
+	if (state->direction == ForwardScanDirection)
+	{
+		if (cursor->pos.itemIndex < cursor->pos.lastItem)
+		{
+			cursor->pos.itemIndex++;
+			found = true;
+		}
+	}
+	else
+	{
+		if (cursor->pos.itemIndex > cursor->pos.firstItem)
+		{
+			cursor->pos.itemIndex--;
+			found = true;
+		}
+	}
+
+	if (!found)
+	{
+		/*
+		 * Current page exhausted. Use _bt_next to get the next page.
+		 * We swap our cursor's position into the scan's currPos,
+		 * call _bt_next, then swap back.
+		 */
+		BTScanPosData save_pos;
+
+		memcpy(&save_pos, &so->currPos, sizeof(BTScanPosData));
+		memcpy(&so->currPos, &cursor->pos, sizeof(BTScanPosData));
+
+		found = _bt_next(scan, state->direction);
+
+		if (found)
+			memcpy(&cursor->pos, &so->currPos, sizeof(BTScanPosData));
+
+		memcpy(&so->currPos, &save_pos, sizeof(BTScanPosData));
+	}
+
+	if (found)
+	{
+		/* Extract new sort key */
+		cursor->sort_key = bt_merge_extract_sortkey(state, scan, cursor,
+													&cursor->sort_key_isnull);
+		state->tuples_accessed++;
+	}
+	else
+	{
+		cursor->exhausted = true;
+	}
+
+	return found;
+}
+
+
+/*
+ * bt_merge_extract_sortkey
+ *	  Extract the sort key (suffix column value) from the current tuple.
+ */
+static Datum
+bt_merge_extract_sortkey(BTMergeScanState *state,
+						 IndexScanDesc scan,
+						 BTMergeCursor *cursor,
+						 bool *isnull)
+{
+	Relation	rel = scan->indexRelation;
+	Buffer		buf;
+	Page		page;
+	OffsetNumber offnum;
+	ItemId		itemid;
+	IndexTuple	itup;
+	TupleDesc	tupdesc;
+	Datum		result;
+
+	if (cursor->pos.currPage == InvalidBlockNumber)
+	{
+		*isnull = true;
+		return (Datum) 0;
+	}
+
+	/* Read the page */
+	buf = ReadBuffer(rel, cursor->pos.currPage);
+	LockBuffer(buf, BT_READ);
+	page = BufferGetPage(buf);
+
+	offnum = cursor->pos.items[cursor->pos.itemIndex].indexOffset;
+	itemid = PageGetItemId(page, offnum);
+	itup = (IndexTuple) PageGetItem(page, itemid);
+	tupdesc = RelationGetDescr(rel);
+
+	/* Extract the suffix column value */
+	result = index_getattr(itup, state->suffix_attno, tupdesc, isnull);
+
+	/* Copy pass-by-reference values before releasing buffer */
+	if (!*isnull)
+	{
+		Form_pg_attribute attr = TupleDescAttr(tupdesc, state->suffix_attno - 1);
+
+		if (!attr->attbyval)
+			result = datumCopy(result, attr->attbyval, attr->attlen);
+	}
+
+	UnlockReleaseBuffer(buf);
+
+	return result;
+}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 77224859685..0d4e7440760 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -20,6 +20,7 @@
 #include "catalog/pg_am_d.h"
 #include "catalog/pg_class.h"
 #include "catalog/pg_index.h"
+#include "lib/pairingheap.h"
 #include "lib/stringinfo.h"
 #include "storage/bufmgr.h"
 #include "storage/dsm.h"
@@ -1050,6 +1051,49 @@ typedef struct BTArrayKeyInfo
 	ScanKey		high_compare;	/* array's < or <= upper bound */
 } BTArrayKeyInfo;
 
+/*
+ * BTMergeCursor - tracks scan state for one prefix value in merge scan
+ *
+ * Each cursor maintains its own position within the index for a specific
+ * prefix value. Cursors are organized in a min-heap ordered by their
+ * current suffix key value for efficient K-way merge.
+ */
+typedef struct BTMergeCursor
+{
+	pairingheap_node ph_node;	/* pairing heap node for merge */
+	int			cursor_id;		/* index in merge state's cursors array */
+	Datum		prefix_value;	/* the prefix value for this sub-scan */
+	bool		prefix_isnull;	/* is prefix value NULL? */
+	Datum		sort_key;		/* current tuple's sort key (suffix) */
+	bool		sort_key_isnull;/* is sort key NULL? */
+	bool		exhausted;		/* no more tuples for this prefix */
+	BTScanPosData pos;			/* current position in index */
+	char	   *tuples;			/* tuple storage workspace (BLCKSZ) */
+} BTMergeCursor;
+
+/*
+ * BTMergeScanState - state for K-way merge scan
+ *
+ * This structure manages multiple cursors for a merge scan, allowing
+ * lazy evaluation of queries like:
+ *   WHERE prefix IN (v1, v2, ..., vK) AND suffix >= b ORDER BY suffix LIMIT N
+ */
+typedef struct BTMergeScanState
+{
+	int			num_cursors;	/* number of prefix values (K) */
+	int			active_cursors;	/* cursors not yet exhausted */
+	BTMergeCursor *cursors;		/* array of cursors */
+	pairingheap *merge_heap;	/* min-heap ordered by sort_key */
+	int			prefix_attno;	/* attribute number of prefix column (1-based) */
+	int			suffix_attno;	/* attribute number of suffix column (1-based) */
+	FmgrInfo	suffix_cmp;		/* comparison function for suffix */
+	Oid			suffix_collation;	/* collation for suffix comparison */
+	ScanDirection direction;	/* scan direction */
+	bool		initialized;	/* have cursors been initialized? */
+	MemoryContext merge_context;/* memory context for allocations */
+	int64		tuples_accessed;/* count of index tuples accessed */
+} BTMergeScanState;
+
 typedef struct BTScanOpaqueData
 {
 	/* these fields are set by _bt_preprocess_keys(): */
@@ -1089,6 +1133,12 @@ typedef struct BTScanOpaqueData
 	 */
 	int			markItemIndex;	/* itemIndex, or -1 if not valid */
 
+	/*
+	 * Merge scan state, if using merge scan optimization.
+	 * NULL if not using merge scan.
+	 */
+	BTMergeScanState *mergeState;
+
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
 	BTScanPosData markPos;		/* marked position, if any */
@@ -1334,4 +1384,18 @@ extern IndexBuildResult *btbuild(Relation heap, Relation index,
 								 struct IndexInfo *indexInfo);
 extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
 
+/*
+ * prototypes for functions in nbtmergescan.c
+ */
+extern BTMergeScanState *bt_merge_init(IndexScanDesc scan,
+									   Datum *prefix_values,
+									   bool *prefix_nulls,
+									   int num_prefixes,
+									   int prefix_attno,
+									   int suffix_attno,
+									   Oid suffix_cmp_oid,
+									   Oid suffix_collation);
+extern bool bt_merge_getnext(IndexScanDesc scan, ScanDirection dir);
+extern void bt_merge_end(BTMergeScanState *state);
+
 #endif							/* NBTREE_H */
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 2634a519935..b7b802bfdde 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -18,6 +18,7 @@ subdir('ssl_passphrase_callback')
 subdir('test_aio')
 subdir('test_binaryheap')
 subdir('test_bitmapset')
+subdir('test_btree_merge')
 subdir('test_bloomfilter')
 subdir('test_cloexec')
 subdir('test_copy_callbacks')
diff --git a/src/test/modules/test_btree_merge/Makefile b/src/test/modules/test_btree_merge/Makefile
new file mode 100644
index 00000000000..540416a2c91
--- /dev/null
+++ b/src/test/modules/test_btree_merge/Makefile
@@ -0,0 +1,24 @@
+# src/test/modules/test_btree_merge/Makefile
+
+MODULE_big = test_btree_merge
+OBJS = \
+	$(WIN32RES) \
+	test_btree_merge.o
+
+PGFILEDESC = "test_btree_merge - test code for btree merge scan"
+
+EXTENSION = test_btree_merge
+DATA = test_btree_merge--1.0.sql
+
+REGRESS = test_btree_merge
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_btree_merge
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_btree_merge/expected/test_btree_merge.out b/src/test/modules/test_btree_merge/expected/test_btree_merge.out
new file mode 100644
index 00000000000..baf4d7937e0
--- /dev/null
+++ b/src/test/modules/test_btree_merge/expected/test_btree_merge.out
@@ -0,0 +1,243 @@
+-- Unit tests for B-tree merge scan implementation
+-- Tests the core merge scan algorithm directly, bypassing the planner
+CREATE EXTENSION test_btree_merge;
+-- ============================================================================
+-- Setup: Create test tables with known data distributions
+-- ============================================================================
+-- Test table with integer prefix and suffix
+CREATE TABLE merge_test_int (
+    prefix_col int4,
+    suffix_col int4
+);
+-- Insert data: 10 prefix values, 100 suffix values each = 1000 rows
+INSERT INTO merge_test_int
+SELECT p, s
+FROM generate_series(1, 10) AS p,
+     generate_series(1, 100) AS s;
+CREATE INDEX merge_test_int_idx ON merge_test_int (prefix_col, suffix_col);
+ANALYZE merge_test_int;
+-- Test table with integer prefix and timestamp suffix
+CREATE TABLE merge_test_ts (
+    user_id int4,
+    event_time timestamp
+);
+-- Insert data: 5 users, 100 events each
+INSERT INTO merge_test_ts
+SELECT u, '2026-01-01 00:00:00'::timestamp + (e || ' minutes')::interval
+FROM generate_series(1, 5) AS u,
+     generate_series(1, 100) AS e;
+CREATE INDEX merge_test_ts_idx ON merge_test_ts (user_id, event_time);
+ANALYZE merge_test_ts;
+-- ============================================================================
+-- Test 1: Basic integer merge scan
+-- Query: WHERE prefix IN (1,2,3) AND suffix >= 50 LIMIT 5
+-- K = 3 prefix values, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+SELECT 'Test 1: Basic integer merge scan' AS test_name;
+            test_name             
+----------------------------------
+ Test 1: Basic integer merge scan
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1, 2, 3],
+    50,
+    5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches 
+-----------------+-----------------+--------------------------
+               5 |               8 |                        7
+(1 row)
+
+-- ============================================================================
+-- Test 2: More prefix values
+-- Query: WHERE prefix IN (1,2,3,4,5) AND suffix >= 80 LIMIT 3
+-- K = 5 prefix values, LIMIT = 3
+-- Expected tuples accessed: 3 + 5 - 1 = 7
+-- ============================================================================
+SELECT 'Test 2: More prefix values' AS test_name;
+         test_name          
+----------------------------
+ Test 2: More prefix values
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1, 2, 3, 4, 5],
+    80,
+    3
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches 
+-----------------+-----------------+--------------------------
+               3 |               8 |                        7
+(1 row)
+
+-- ============================================================================
+-- Test 3: Single prefix value (degenerates to regular scan)
+-- K = 1, LIMIT = 5
+-- Expected tuples accessed: 5 + 1 - 1 = 5
+-- ============================================================================
+SELECT 'Test 3: Single prefix value' AS test_name;
+          test_name          
+-----------------------------
+ Test 3: Single prefix value
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1],
+    50,
+    5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches 
+-----------------+-----------------+--------------------------
+               5 |               6 |                        5
+(1 row)
+
+-- ============================================================================
+-- Test 4: Large LIMIT (more than matching rows)
+-- K = 3, prefix values that have 51 rows each (suffix >= 50)
+-- LIMIT = 200 but only 153 rows exist
+-- ============================================================================
+SELECT 'Test 4: Large LIMIT' AS test_name;
+      test_name      
+---------------------
+ Test 4: Large LIMIT
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1, 2, 3],
+    50,
+    200
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches 
+-----------------+-----------------+--------------------------
+             153 |             153 |                      153
+(1 row)
+
+-- ============================================================================
+-- Test 5: Non-contiguous prefix values
+-- Query: WHERE prefix IN (2,5,8) AND suffix >= 50 LIMIT 5
+-- Tests that merge scan works with gaps in prefix values
+-- K = 3 prefix values (non-adjacent), LIMIT = 5
+-- ============================================================================
+SELECT 'Test 5: Non-contiguous prefix values' AS test_name;
+              test_name               
+--------------------------------------
+ Test 5: Non-contiguous prefix values
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[2, 5, 8],
+    50,
+    5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches 
+-----------------+-----------------+--------------------------
+               5 |               8 |                        7
+(1 row)
+
+-- ============================================================================
+-- Test 6: Timestamp suffix column
+-- Query: WHERE user_id IN (1,2,3) AND event_time >= '2026-01-01 01:00:00' LIMIT 5
+-- K = 3, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+SELECT 'Test 6: Timestamp suffix' AS test_name;
+        test_name         
+--------------------------
+ Test 6: Timestamp suffix
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_ts(
+    'merge_test_ts',
+    'merge_test_ts_idx',
+    ARRAY[1, 2, 3],
+    '2026-01-01 01:00:00'::timestamp,
+    5
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches 
+-----------------+-----------------+--------------------------
+               5 |               8 |                        7
+(1 row)
+
+-- ============================================================================
+-- Test 7: All users with timestamp
+-- K = 5, LIMIT = 10
+-- Expected tuples accessed: 10 + 5 - 1 = 14
+-- ============================================================================
+SELECT 'Test 7: All users timestamp' AS test_name;
+          test_name          
+-----------------------------
+ Test 7: All users timestamp
+(1 row)
+
+SELECT * FROM test_btree_merge_scan_ts(
+    'merge_test_ts',
+    'merge_test_ts_idx',
+    ARRAY[1, 2, 3, 4, 5],
+    '2026-01-01 00:30:00'::timestamp,
+    10
+);
+ tuples_returned | tuples_accessed | maximum_required_fetches 
+-----------------+-----------------+--------------------------
+              10 |              15 |                       14
+(1 row)
+
+-- ============================================================================
+-- Test 8: Correctness verification
+-- Verify merge scan returns rows in exact ORDER BY suffix_col, prefix_col order
+-- Using WITH ORDINALITY to compare row positions
+-- ============================================================================
+SELECT 'Test 8: Correctness verification' AS test_name;
+            test_name             
+----------------------------------
+ Test 8: Correctness verification
+(1 row)
+
+-- Compare merge scan vs regular query with row positions (should be empty)
+WITH merge_result AS (
+    SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+    FROM test_btree_merge_fetch_int(
+        'merge_test_int',
+        'merge_test_int_idx',
+        ARRAY[1, 2, 3],
+        90,
+        10
+    )
+),
+regular_result AS (
+    SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+    FROM (
+        SELECT prefix_col, suffix_col
+        FROM merge_test_int
+        WHERE prefix_col IN (1, 2, 3) AND suffix_col >= 90
+        ORDER BY suffix_col, prefix_col
+        LIMIT 10
+    ) t
+)
+SELECT 'MISMATCH' AS status, m.rn, m.prefix_col, m.suffix_col,
+       r.prefix_col AS expected_prefix, r.suffix_col AS expected_suffix
+FROM merge_result m
+FULL OUTER JOIN regular_result r ON m.rn = r.rn
+WHERE m.prefix_col IS DISTINCT FROM r.prefix_col
+   OR m.suffix_col IS DISTINCT FROM r.suffix_col;
+ status | rn | prefix_col | suffix_col | expected_prefix | expected_suffix 
+--------+----+------------+------------+-----------------+-----------------
+(0 rows)
+
+-- ============================================================================
+-- Cleanup
+-- ============================================================================
+DROP TABLE merge_test_int;
+DROP TABLE merge_test_ts;
+DROP EXTENSION test_btree_merge;
diff --git a/src/test/modules/test_btree_merge/meson.build b/src/test/modules/test_btree_merge/meson.build
new file mode 100644
index 00000000000..665d6cf443e
--- /dev/null
+++ b/src/test/modules/test_btree_merge/meson.build
@@ -0,0 +1,33 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+test_btree_merge_sources = files(
+  'test_btree_merge.c',
+)
+
+if host_system == 'windows'
+  test_btree_merge_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_btree_merge',
+    '--FILEDESC', 'test_btree_merge - test code for btree merge scan',])
+endif
+
+test_btree_merge = shared_module('test_btree_merge',
+  test_btree_merge_sources,
+  kwargs: pg_test_mod_args,
+)
+test_install_libs += test_btree_merge
+
+test_install_data += files(
+  'test_btree_merge.control',
+  'test_btree_merge--1.0.sql',
+)
+
+tests += {
+  'name': 'test_btree_merge',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'test_btree_merge',
+    ],
+  },
+}
diff --git a/src/test/modules/test_btree_merge/sql/test_btree_merge.sql b/src/test/modules/test_btree_merge/sql/test_btree_merge.sql
new file mode 100644
index 00000000000..5828b343b34
--- /dev/null
+++ b/src/test/modules/test_btree_merge/sql/test_btree_merge.sql
@@ -0,0 +1,207 @@
+-- Unit tests for B-tree merge scan implementation
+-- Tests the core merge scan algorithm directly, bypassing the planner
+
+CREATE EXTENSION test_btree_merge;
+
+-- ============================================================================
+-- Setup: Create test tables with known data distributions
+-- ============================================================================
+
+-- Test table with integer prefix and suffix
+CREATE TABLE merge_test_int (
+    prefix_col int4,
+    suffix_col int4
+);
+
+-- Insert data: 10 prefix values, 100 suffix values each = 1000 rows
+INSERT INTO merge_test_int
+SELECT p, s
+FROM generate_series(1, 10) AS p,
+     generate_series(1, 100) AS s;
+
+CREATE INDEX merge_test_int_idx ON merge_test_int (prefix_col, suffix_col);
+ANALYZE merge_test_int;
+
+-- Test table with integer prefix and timestamp suffix
+CREATE TABLE merge_test_ts (
+    user_id int4,
+    event_time timestamp
+);
+
+-- Insert data: 5 users, 100 events each
+INSERT INTO merge_test_ts
+SELECT u, '2026-01-01 00:00:00'::timestamp + (e || ' minutes')::interval
+FROM generate_series(1, 5) AS u,
+     generate_series(1, 100) AS e;
+
+CREATE INDEX merge_test_ts_idx ON merge_test_ts (user_id, event_time);
+ANALYZE merge_test_ts;
+
+
+-- ============================================================================
+-- Test 1: Basic integer merge scan
+-- Query: WHERE prefix IN (1,2,3) AND suffix >= 50 LIMIT 5
+-- K = 3 prefix values, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+
+SELECT 'Test 1: Basic integer merge scan' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1, 2, 3],
+    50,
+    5
+);
+
+
+-- ============================================================================
+-- Test 2: More prefix values
+-- Query: WHERE prefix IN (1,2,3,4,5) AND suffix >= 80 LIMIT 3
+-- K = 5 prefix values, LIMIT = 3
+-- Expected tuples accessed: 3 + 5 - 1 = 7
+-- ============================================================================
+
+SELECT 'Test 2: More prefix values' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1, 2, 3, 4, 5],
+    80,
+    3
+);
+
+
+-- ============================================================================
+-- Test 3: Single prefix value (degenerates to regular scan)
+-- K = 1, LIMIT = 5
+-- Expected tuples accessed: 5 + 1 - 1 = 5
+-- ============================================================================
+
+SELECT 'Test 3: Single prefix value' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1],
+    50,
+    5
+);
+
+
+-- ============================================================================
+-- Test 4: Large LIMIT (more than matching rows)
+-- K = 3, prefix values that have 51 rows each (suffix >= 50)
+-- LIMIT = 200 but only 153 rows exist
+-- ============================================================================
+
+SELECT 'Test 4: Large LIMIT' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[1, 2, 3],
+    50,
+    200
+);
+
+
+-- ============================================================================
+-- Test 5: Non-contiguous prefix values
+-- Query: WHERE prefix IN (2,5,8) AND suffix >= 50 LIMIT 5
+-- Tests that merge scan works with gaps in prefix values
+-- K = 3 prefix values (non-adjacent), LIMIT = 5
+-- ============================================================================
+
+SELECT 'Test 5: Non-contiguous prefix values' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_int(
+    'merge_test_int',
+    'merge_test_int_idx',
+    ARRAY[2, 5, 8],
+    50,
+    5
+);
+
+
+-- ============================================================================
+-- Test 6: Timestamp suffix column
+-- Query: WHERE user_id IN (1,2,3) AND event_time >= '2026-01-01 01:00:00' LIMIT 5
+-- K = 3, LIMIT = 5
+-- Expected tuples accessed: 5 + 3 - 1 = 7
+-- ============================================================================
+
+SELECT 'Test 6: Timestamp suffix' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_ts(
+    'merge_test_ts',
+    'merge_test_ts_idx',
+    ARRAY[1, 2, 3],
+    '2026-01-01 01:00:00'::timestamp,
+    5
+);
+
+
+-- ============================================================================
+-- Test 7: All users with timestamp
+-- K = 5, LIMIT = 10
+-- Expected tuples accessed: 10 + 5 - 1 = 14
+-- ============================================================================
+
+SELECT 'Test 7: All users timestamp' AS test_name;
+
+SELECT * FROM test_btree_merge_scan_ts(
+    'merge_test_ts',
+    'merge_test_ts_idx',
+    ARRAY[1, 2, 3, 4, 5],
+    '2026-01-01 00:30:00'::timestamp,
+    10
+);
+
+
+-- ============================================================================
+-- Test 8: Correctness verification
+-- Verify merge scan returns rows in exact ORDER BY suffix_col, prefix_col order
+-- Using WITH ORDINALITY to compare row positions
+-- ============================================================================
+
+SELECT 'Test 8: Correctness verification' AS test_name;
+
+-- Compare merge scan vs regular query with row positions (should be empty)
+WITH merge_result AS (
+    SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+    FROM test_btree_merge_fetch_int(
+        'merge_test_int',
+        'merge_test_int_idx',
+        ARRAY[1, 2, 3],
+        90,
+        10
+    )
+),
+regular_result AS (
+    SELECT row_number() OVER () AS rn, prefix_col, suffix_col
+    FROM (
+        SELECT prefix_col, suffix_col
+        FROM merge_test_int
+        WHERE prefix_col IN (1, 2, 3) AND suffix_col >= 90
+        ORDER BY suffix_col, prefix_col
+        LIMIT 10
+    ) t
+)
+SELECT 'MISMATCH' AS status, m.rn, m.prefix_col, m.suffix_col,
+       r.prefix_col AS expected_prefix, r.suffix_col AS expected_suffix
+FROM merge_result m
+FULL OUTER JOIN regular_result r ON m.rn = r.rn
+WHERE m.prefix_col IS DISTINCT FROM r.prefix_col
+   OR m.suffix_col IS DISTINCT FROM r.suffix_col;
+
+
+-- ============================================================================
+-- Cleanup
+-- ============================================================================
+
+DROP TABLE merge_test_int;
+DROP TABLE merge_test_ts;
+DROP EXTENSION test_btree_merge;
diff --git a/src/test/modules/test_btree_merge/test_btree_merge--1.0.sql b/src/test/modules/test_btree_merge/test_btree_merge--1.0.sql
new file mode 100644
index 00000000000..9872947d7d7
--- /dev/null
+++ b/src/test/modules/test_btree_merge/test_btree_merge--1.0.sql
@@ -0,0 +1,43 @@
+/* src/test/modules/test_btree_merge/test_btree_merge--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_btree_merge" to load this file. \quit
+
+-- Test merge scan with integer columns
+CREATE FUNCTION test_btree_merge_scan_int(
+    table_name text,
+    index_name text,
+    prefix_values int4[],
+    suffix_start int4,
+    limit_count int4
+) RETURNS TABLE (
+    tuples_returned int4,
+    tuples_accessed int4,
+    maximum_required_fetches int4
+) AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+-- Fetch actual rows from merge scan (for correctness verification)
+CREATE FUNCTION test_btree_merge_fetch_int(
+    table_name text,
+    index_name text,
+    prefix_values int4[],
+    suffix_start int4,
+    limit_count int4
+) RETURNS TABLE (
+    prefix_col int4,
+    suffix_col int4
+) AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
+-- Test merge scan with timestamp suffix
+CREATE FUNCTION test_btree_merge_scan_ts(
+    table_name text,
+    index_name text,
+    prefix_values int4[],
+    suffix_start timestamp,
+    limit_count int4
+) RETURNS TABLE (
+    tuples_returned int4,
+    tuples_accessed int4,
+    maximum_required_fetches int4
+) AS 'MODULE_PATHNAME' LANGUAGE C STRICT;
+
diff --git a/src/test/modules/test_btree_merge/test_btree_merge.c b/src/test/modules/test_btree_merge/test_btree_merge.c
new file mode 100644
index 00000000000..78b22130ecf
--- /dev/null
+++ b/src/test/modules/test_btree_merge/test_btree_merge.c
@@ -0,0 +1,389 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_btree_merge.c
+ *		Unit tests for B-tree Merge Scan implementation
+ *
+ * This module provides SQL-callable functions to directly test the
+ * merge scan algorithm without going through the planner.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/nbtree.h"
+#include "access/table.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_type.h"
+#include "commands/defrem.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
+
+PG_MODULE_MAGIC;
+
+#define MAX_RESULTS 10000
+
+/*
+ * MergeScanResult - holds results from a merge scan execution
+ */
+typedef struct MergeScanResult
+{
+	int			tuples_returned;
+	int64		tuples_accessed;
+	int			num_prefixes;
+	int			limit_count;
+	/* For fetch function: collected row data */
+	int32	   *prefixes;
+	int32	   *suffixes;
+} MergeScanResult;
+
+/*
+ * do_merge_scan - common merge scan execution
+ *
+ * Performs a merge scan with the given parameters and collects results.
+ * If collect_rows is true, fetches and stores actual row data.
+ */
+static void
+do_merge_scan(const char *table_name,
+			  const char *index_name,
+			  Datum *prefix_values,
+			  bool *prefix_nulls,
+			  int num_prefixes,
+			  Datum suffix_start,
+			  Oid suffix_type,
+			  RegProcedure suffix_eq_proc,
+			  RegProcedure suffix_ge_proc,
+			  int limit_count,
+			  bool collect_rows,
+			  MergeScanResult *result)
+{
+	Oid			table_oid;
+	Oid			index_oid;
+	Relation	heap_rel;
+	Relation	index_rel;
+	IndexScanDesc scan;
+	BTScanOpaque so;
+	BTMergeScanState *merge_state;
+	Snapshot	snapshot;
+	Oid			suffix_cmp_oid;
+	Oid			opfamily;
+	const char *opfamily_name;
+	int			tuples_returned = 0;
+	int			max_results;
+
+	/* Determine operator family based on suffix type */
+	if (suffix_type == INT4OID)
+		opfamily_name = "integer_ops";
+	else if (suffix_type == TIMESTAMPOID)
+		opfamily_name = "datetime_ops";
+	else
+		elog(ERROR, "unsupported suffix type: %u", suffix_type);
+
+	/* Look up table and index */
+	table_oid = RelnameGetRelid(table_name);
+	if (!OidIsValid(table_oid))
+		elog(ERROR, "table \"%s\" does not exist", table_name);
+
+	index_oid = RelnameGetRelid(index_name);
+	if (!OidIsValid(index_oid))
+		elog(ERROR, "index \"%s\" does not exist", index_name);
+
+	/* Open relations */
+	heap_rel = table_open(table_oid, AccessShareLock);
+	index_rel = index_open(index_oid, AccessShareLock);
+
+	/* Get comparison function for suffix type */
+	opfamily = get_opfamily_oid(BTREE_AM_OID,
+								list_make1(makeString(pstrdup(opfamily_name))),
+								false);
+	suffix_cmp_oid = get_opfamily_proc(opfamily, suffix_type, suffix_type,
+									   BTORDER_PROC);
+	if (!OidIsValid(suffix_cmp_oid))
+		elog(ERROR, "could not find comparison function for type %u", suffix_type);
+
+	/* Begin index scan */
+	snapshot = GetActiveSnapshot();
+	scan = index_beginscan(heap_rel, index_rel, snapshot, NULL, 2, 0);
+
+	/* Set up scan keys */
+	{
+		ScanKeyData keys[2];
+
+		ScanKeyInit(&keys[0], 1, BTEqualStrategyNumber, suffix_eq_proc,
+					prefix_values[0]);
+		ScanKeyInit(&keys[1], 2, BTGreaterEqualStrategyNumber, suffix_ge_proc,
+					suffix_start);
+		index_rescan(scan, keys, 2, NULL, 0);
+	}
+
+	so = (BTScanOpaque) scan->opaque;
+
+	/* Initialize merge scan */
+	merge_state = bt_merge_init(scan, prefix_values, prefix_nulls,
+								num_prefixes, 1, 2, suffix_cmp_oid, InvalidOid);
+	so->mergeState = merge_state;
+
+	/* Execute scan */
+	max_results = (limit_count > 0) ? limit_count : MAX_RESULTS;
+
+	while (tuples_returned < max_results)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		if (!bt_merge_getnext(scan, ForwardScanDirection))
+			break;
+
+		if (collect_rows && result->prefixes != NULL)
+		{
+			/* Fetch heap tuple to get actual values */
+			HeapTupleData heapTuple;
+			Buffer		heapBuffer;
+			bool		isnull;
+
+			heapTuple.t_self = scan->xs_heaptid;
+			if (heap_fetch(heap_rel, snapshot, &heapTuple, &heapBuffer, false))
+			{
+				result->prefixes[tuples_returned] =
+					DatumGetInt32(heap_getattr(&heapTuple, 1,
+											   RelationGetDescr(heap_rel), &isnull));
+				result->suffixes[tuples_returned] =
+					DatumGetInt32(heap_getattr(&heapTuple, 2,
+											   RelationGetDescr(heap_rel), &isnull));
+				ReleaseBuffer(heapBuffer);
+			}
+		}
+
+		tuples_returned++;
+
+		if (tuples_returned >= MAX_RESULTS)
+		{
+			elog(WARNING, "merge scan hit safety limit of %d tuples", MAX_RESULTS);
+			break;
+		}
+	}
+
+	/* Collect results before cleanup */
+	result->tuples_returned = tuples_returned;
+	result->tuples_accessed = merge_state->tuples_accessed;
+	result->num_prefixes = num_prefixes;
+	result->limit_count = limit_count;
+
+	/* Clean up */
+	bt_merge_end(merge_state);
+	so->mergeState = NULL;
+	index_endscan(scan);
+	index_close(index_rel, AccessShareLock);
+	table_close(heap_rel, AccessShareLock);
+}
+
+/*
+ * build_stats_result - build the stats result tuple
+ */
+static Datum
+build_stats_result(FunctionCallInfo fcinfo, MergeScanResult *result)
+{
+	TupleDesc	tupdesc;
+	Datum		values[3];
+	bool		nulls[3] = {false, false, false};
+	HeapTuple	tuple;
+	int			max_required_fetches;
+
+	/* Calculate expected max fetches */
+	if (result->tuples_returned < result->limit_count)
+		max_required_fetches = result->tuples_returned;
+	else
+		max_required_fetches = result->limit_count + result->num_prefixes - 1;
+
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("function returning record called in context "
+						"that cannot accept type record")));
+
+	tupdesc = BlessTupleDesc(tupdesc);
+
+	values[0] = Int32GetDatum(result->tuples_returned);
+	values[1] = Int32GetDatum((int32) result->tuples_accessed);
+	values[2] = Int32GetDatum(max_required_fetches);
+
+	tuple = heap_form_tuple(tupdesc, values, nulls);
+	return HeapTupleGetDatum(tuple);
+}
+
+
+/*
+ * test_btree_merge_scan_int - test merge scan with integer columns
+ */
+PG_FUNCTION_INFO_V1(test_btree_merge_scan_int);
+
+Datum
+test_btree_merge_scan_int(PG_FUNCTION_ARGS)
+{
+	text	   *table_name = PG_GETARG_TEXT_PP(0);
+	text	   *index_name = PG_GETARG_TEXT_PP(1);
+	ArrayType  *prefix_array = PG_GETARG_ARRAYTYPE_P(2);
+	int32		suffix_start = PG_GETARG_INT32(3);
+	int32		limit_count = PG_GETARG_INT32(4);
+	Datum	   *prefix_values;
+	bool	   *prefix_nulls;
+	int			num_prefixes;
+	MergeScanResult result = {0};
+
+	deconstruct_array(prefix_array, INT4OID, sizeof(int32), true, TYPALIGN_INT,
+					  &prefix_values, &prefix_nulls, &num_prefixes);
+
+	if (num_prefixes == 0)
+		elog(ERROR, "prefix_values array cannot be empty");
+
+	do_merge_scan(text_to_cstring(table_name),
+				  text_to_cstring(index_name),
+				  prefix_values, prefix_nulls, num_prefixes,
+				  Int32GetDatum(suffix_start), INT4OID,
+				  F_INT4EQ, F_INT4GE,
+				  limit_count, false, &result);
+
+	return build_stats_result(fcinfo, &result);
+}
+
+
+/*
+ * test_btree_merge_scan_ts - test merge scan with timestamp suffix
+ */
+PG_FUNCTION_INFO_V1(test_btree_merge_scan_ts);
+
+Datum
+test_btree_merge_scan_ts(PG_FUNCTION_ARGS)
+{
+	text	   *table_name = PG_GETARG_TEXT_PP(0);
+	text	   *index_name = PG_GETARG_TEXT_PP(1);
+	ArrayType  *prefix_array = PG_GETARG_ARRAYTYPE_P(2);
+	Timestamp	suffix_start = PG_GETARG_TIMESTAMP(3);
+	int32		limit_count = PG_GETARG_INT32(4);
+	Datum	   *prefix_values;
+	bool	   *prefix_nulls;
+	int			num_prefixes;
+	MergeScanResult result = {0};
+
+	deconstruct_array(prefix_array, INT4OID, sizeof(int32), true, TYPALIGN_INT,
+					  &prefix_values, &prefix_nulls, &num_prefixes);
+
+	if (num_prefixes == 0)
+		elog(ERROR, "prefix_values array cannot be empty");
+
+	do_merge_scan(text_to_cstring(table_name),
+				  text_to_cstring(index_name),
+				  prefix_values, prefix_nulls, num_prefixes,
+				  TimestampGetDatum(suffix_start), TIMESTAMPOID,
+				  F_INT4EQ, F_TIMESTAMP_GE,
+				  limit_count, false, &result);
+
+	return build_stats_result(fcinfo, &result);
+}
+
+
+/*
+ * test_btree_merge_fetch_int - fetch actual rows from merge scan
+ */
+PG_FUNCTION_INFO_V1(test_btree_merge_fetch_int);
+
+Datum
+test_btree_merge_fetch_int(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+
+	typedef struct
+	{
+		int32	   *prefixes;
+		int32	   *suffixes;
+		int			num_results;
+		int			current_idx;
+	} FetchContext;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *table_name = PG_GETARG_TEXT_PP(0);
+		text	   *index_name = PG_GETARG_TEXT_PP(1);
+		ArrayType  *prefix_array = PG_GETARG_ARRAYTYPE_P(2);
+		int32		suffix_start = PG_GETARG_INT32(3);
+		int32		limit_count = PG_GETARG_INT32(4);
+		Datum	   *prefix_values;
+		bool	   *prefix_nulls;
+		int			num_prefixes;
+		MemoryContext oldcontext;
+		FetchContext *fctx;
+		MergeScanResult result = {0};
+		TupleDesc	tupdesc;
+		int			max_results;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		deconstruct_array(prefix_array, INT4OID, sizeof(int32), true, TYPALIGN_INT,
+						  &prefix_values, &prefix_nulls, &num_prefixes);
+
+		if (num_prefixes == 0)
+			elog(ERROR, "prefix_values array cannot be empty");
+
+		/* Allocate result storage */
+		max_results = (limit_count > 0) ? limit_count : MAX_RESULTS;
+		fctx = palloc(sizeof(FetchContext));
+		fctx->prefixes = palloc(max_results * sizeof(int32));
+		fctx->suffixes = palloc(max_results * sizeof(int32));
+		fctx->current_idx = 0;
+
+		/* Point result to our storage */
+		result.prefixes = fctx->prefixes;
+		result.suffixes = fctx->suffixes;
+
+		do_merge_scan(text_to_cstring(table_name),
+					  text_to_cstring(index_name),
+					  prefix_values, prefix_nulls, num_prefixes,
+					  Int32GetDatum(suffix_start), INT4OID,
+					  F_INT4EQ, F_INT4GE,
+					  limit_count, true, &result);
+
+		fctx->num_results = result.tuples_returned;
+
+		/* Build result tuple descriptor */
+		tupdesc = CreateTemplateTupleDesc(2);
+		TupleDescInitEntry(tupdesc, 1, "prefix_col", INT4OID, -1, 0);
+		TupleDescInitEntry(tupdesc, 2, "suffix_col", INT4OID, -1, 0);
+		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+		funcctx->user_fctx = fctx;
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	{
+		FetchContext *fctx = funcctx->user_fctx;
+
+		if (fctx->current_idx < fctx->num_results)
+		{
+			Datum		values[2];
+			bool		nulls[2] = {false, false};
+			HeapTuple	tuple;
+
+			values[0] = Int32GetDatum(fctx->prefixes[fctx->current_idx]);
+			values[1] = Int32GetDatum(fctx->suffixes[fctx->current_idx]);
+			fctx->current_idx++;
+
+			tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+			SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+		}
+		else
+		{
+			SRF_RETURN_DONE(funcctx);
+		}
+	}
+}
diff --git a/src/test/modules/test_btree_merge/test_btree_merge.control b/src/test/modules/test_btree_merge/test_btree_merge.control
new file mode 100644
index 00000000000..f8146bd0f74
--- /dev/null
+++ b/src/test/modules/test_btree_merge/test_btree_merge.control
@@ -0,0 +1,5 @@
+# test_btree_merge extension
+comment = 'Unit tests for B-tree merge scan'
+default_version = '1.0'
+module_pathname = '$libdir/test_btree_merge'
+relocatable = true
diff --git a/src/test/regress/expected/btree_merge.out b/src/test/regress/expected/btree_merge.out
new file mode 100644
index 00000000000..441ae1d0657
--- /dev/null
+++ b/src/test/regress/expected/btree_merge.out
@@ -0,0 +1,113 @@
+-- B-Tree Merge Scan Access Method Test
+--
+-- B-Tree Merge Scan is an access method that allows lazily producing
+-- output sorted by a non-leading column when the prefix has few distinct values.
+--
+--
+-- Let S be an infinite set of lattic points (x,y).
+-- Let S(x=1,y>=b) be the sequence of points 
+--     SELECT * FROM S WHERE x = a and y >= b ORDER BY b;
+--     i.e. (a, b), (a, b+1), (a, b+2), ...
+-- Similarly, S(x IN X, y=b) being the sequence of points
+--     SELECT * FROM S WHERE x IN X and y = b ORDER BY x;
+--     i.e. (x[1], b), ..., (x[n], b), (x[1], b+1), ...
+-- The output of S(x IN X, y >= b) can be computed as a
+--
+-- Proposition (uncomputable): 
+-- S(x, IN X, y >= b) is the K-way merge of the sequences 
+-- {S(x=x[i], y >= b), x[i] in X}
+-- 
+--
+--
+-- Proposition (computable): Bounded suffix
+--
+-- S(x, IN X, b1 <= y <= b2) as bounded
+-- can be computed with (SELECT count(distinct x) + count(1) FROM bounded)
+-- tuple accesses.
+-- (Constructive) Proof:
+-- The result of 
+--    SELECT * FROM X 
+--    JOIN S on x = x[i] WHERE y BETWEEN b1 AND b2;
+-- is the same as 
+--    SELECT * FROM X,
+--    LATERAL (
+--        (SELECT * FROM S 
+--           WHERE x = x[i] AND y BETWEEN b1 AND b2
+--        ) AS subscan[i]
+--    ) as merged
+-- 
+-- Each of subscan[i] is covered by a single range in the index and can
+-- and require at most 
+--  (count(1) FROM subscan[i]) + 1    -- subscan tuple access count
+-- tupples to be accessed.
+-- The merged result can be computed using a K-way merge sort 
+-- whose number of rows is
+--   sum(count(1) FROM subscan[i])    -- query output rows
+-- Q.E.D.
+--
+--
+-- Proposition (computable): Limitted query
+-- The query
+--   S(x, IN X, y >= b) LIMIT N as limited
+-- Can be computed with at most 
+--   N + count(distinct X) - 1
+-- tuple accesses.
+--
+-- (Constructive) Proof:
+-- If an upper `u` bound for `MAX(y IN S(x, IN X, y >= b) LIMIT N)` is known,
+-- then the query can be rewritten as
+--   S(x, IN X, b <= y <= u) LIMIT N
+-- The K-way can produce the next element as soon as it has fetched
+-- the next element for each subquery
+-- 1 row can be produced after count(distinct X) fetches,
+-- After that it can produce one new row for each fetch.
+-- Thus, the total number of fetches is at most
+--   N + count(distinct X) - 1
+-- Q.E.D.
+-- Generate a table with lattice points 
+-- Could be infinite
+CREATE TABLE btree_merge_test AS (
+    SELECT x, y FROM 
+        generate_series(1, 50) AS x, 
+        generate_series(1, 50) AS y
+    ORDER BY random()
+);
+CREATE INDEX btree_merge_test_idx ON btree_merge_test USING btree (x, y);
+ANALYSE btree_merge_test;
+SET enable_seqscan = OFF;
+SET enable_bitmapscan = OFF;
+SHOW track_counts;  -- should be 'on'
+ track_counts 
+--------------
+ on
+(1 row)
+
+-- From the limited query proposition this can be computed with 10
+-- tupple accesses.
+SELECT x, y
+FROM btree_merge_test
+WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19
+ORDER BY y, x -- sort x to make result unique
+LIMIT 3;
+ x | y  
+---+----
+ 1 | 19
+ 2 | 19
+ 5 | 19
+(3 rows)
+
+SELECT pg_stat_force_next_flush();
+ pg_stat_force_next_flush 
+--------------------------
+ 
+(1 row)
+
+SELECT idx_scan, idx_tup_read, idx_tup_fetch 
+FROM pg_stat_user_indexes 
+WHERE indexrelname = 'btree_merge_test_idx';
+ idx_scan | idx_tup_read | idx_tup_fetch 
+----------+--------------+---------------
+        5 |           10 |            10
+(1 row)
+
+DROP TABLE btree_merge_test;
diff --git a/src/test/regress/sql/btree_merge.sql b/src/test/regress/sql/btree_merge.sql
new file mode 100644
index 00000000000..be00c33c2a5
--- /dev/null
+++ b/src/test/regress/sql/btree_merge.sql
@@ -0,0 +1,100 @@
+-- B-Tree Merge Scan Access Method Test
+--
+-- B-Tree Merge Scan is an access method that allows lazily producing
+-- output sorted by a non-leading column when the prefix has few distinct values.
+--
+--
+-- Let S be an infinite set of lattic points (x,y).
+-- Let S(x=1,y>=b) be the sequence of points 
+--     SELECT * FROM S WHERE x = a and y >= b ORDER BY b;
+--     i.e. (a, b), (a, b+1), (a, b+2), ...
+-- Similarly, S(x IN X, y=b) being the sequence of points
+--     SELECT * FROM S WHERE x IN X and y = b ORDER BY x;
+--     i.e. (x[1], b), ..., (x[n], b), (x[1], b+1), ...
+-- The output of S(x IN X, y >= b) can be computed as a
+--
+-- Proposition (uncomputable): 
+-- S(x, IN X, y >= b) is the K-way merge of the sequences 
+-- {S(x=x[i], y >= b), x[i] in X}
+-- 
+--
+--
+-- Proposition (computable): Bounded suffix
+--
+-- S(x, IN X, b1 <= y <= b2) as bounded
+-- can be computed with (SELECT count(distinct x) + count(1) FROM bounded)
+-- tuple accesses.
+-- (Constructive) Proof:
+-- The result of 
+--    SELECT * FROM X 
+--    JOIN S on x = x[i] WHERE y BETWEEN b1 AND b2;
+-- is the same as 
+--    SELECT * FROM X,
+--    LATERAL (
+--        (SELECT * FROM S 
+--           WHERE x = x[i] AND y BETWEEN b1 AND b2
+--        ) AS subscan[i]
+--    ) as merged
+-- 
+-- Each of subscan[i] is covered by a single range in the index and can
+-- and require at most 
+--  (count(1) FROM subscan[i]) + 1    -- subscan tuple access count
+-- tupples to be accessed.
+-- The merged result can be computed using a K-way merge sort 
+-- whose number of rows is
+--   sum(count(1) FROM subscan[i])    -- query output rows
+-- Q.E.D.
+--
+--
+-- Proposition (computable): Limitted query
+-- The query
+--   S(x, IN X, y >= b) LIMIT N as limited
+-- Can be computed with at most 
+--   N + count(distinct X) - 1
+-- tuple accesses.
+--
+-- (Constructive) Proof:
+-- If an upper `u` bound for `MAX(y IN S(x, IN X, y >= b) LIMIT N)` is known,
+-- then the query can be rewritten as
+--   S(x, IN X, b <= y <= u) LIMIT N
+-- The K-way can produce the next element as soon as it has fetched
+-- the next element for each subquery
+-- 1 row can be produced after count(distinct X) fetches,
+-- After that it can produce one new row for each fetch.
+-- Thus, the total number of fetches is at most
+--   N + count(distinct X) - 1
+-- Q.E.D.
+
+
+-- Generate a table with lattice points 
+-- Could be infinite
+CREATE TABLE btree_merge_test AS (
+    SELECT x, y FROM 
+        generate_series(1, 50) AS x, 
+        generate_series(1, 50) AS y
+    ORDER BY random()
+);
+CREATE INDEX btree_merge_test_idx ON btree_merge_test USING btree (x, y);
+
+ANALYSE btree_merge_test;
+
+SET enable_seqscan = OFF;
+SET enable_bitmapscan = OFF;
+SHOW track_counts;  -- should be 'on'
+-- From the limited query proposition this can be computed with 10
+-- tupple accesses.
+SELECT x, y
+FROM btree_merge_test
+WHERE x IN (1,2,5,8,13,21,34,55) AND y >= 19
+ORDER BY y, x -- sort x to make result unique
+LIMIT 3;
+
+
+SELECT pg_stat_force_next_flush();
+
+
+SELECT idx_scan, idx_tup_read, idx_tup_fetch 
+FROM pg_stat_user_indexes 
+WHERE indexrelname = 'btree_merge_test_idx';
+
+DROP TABLE btree_merge_test;
\ No newline at end of file

view thread (12+ messages)  latest in thread

reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Reply to all the recipients using the --to and --cc options:
  reply via email

  To: [email protected]
  Cc: [email protected]
  Subject: Re: New access method for b-tree.
  In-Reply-To: <AS1PR02MB784695AFEC37179FFAF7EAE19A9DA@AS1PR02MB7846.eurprd02.prod.outlook.com>

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox