public inbox for [email protected]
help / color / mirror / Atom feedFrom: =?utf-8?B?Y2NhNTUwNw==?= <[email protected]>
To: =?utf-8?B?cGdzcWwtaGFja2Vycw==?= <[email protected]>
Subject: Support loser tree for k-way merge
Date: Wed, 3 Dec 2025 19:48:10 +0800
Message-ID: <[email protected]> (raw)
Hi hackers,
Background
==========
Now we use 'heap' during the k-way merge, it's O(n log k). The 'loser tree' is also O(n log k), but
it's usually has fewer comparisons than the 'heap'. When the tuple comparator is complex, the
'loser tree' can significantly speed up the k-way merge.
Test
====
With the WIP patch(v1-0001), I got a 3% ~ 13%(different work_mem) speed up in the following test case:
SET max_parallel_workers_per_gather = 0;
CREATE UNLOGGED TABLE t AS SELECT generate_series(1, 20000000) AS a, md5(random()::text) AS b;
create extension if not exists pg_prewarm;
select pg_prewarm('t');
SET enable_loser_tree = OFF;
# SET work_mem = '4MB'; ('8MB' '16MB' '32MB' '64MB' ...)
explain analyze select * from t order by b;
SET enable_loser_tree = ON;
# SET work_mem = '4MB'; ('8MB' '16MB' '32MB' '64MB' ...)
explain analyze select * from t order by b;
Open questions
==============
1) Now I add a GUC 'enable_loser_tree' to control the use of loser tree, maybe we should
decide whether to use the 'loser tree' based on the value of 'k', the complexity of tuple
comparators or just always use the 'loser tree'?
Looking forward to your reply and comment.
--
Regards,
ChangAo Chen
Attachments:
[application/octet-stream] v1-0001-Support-loser-tree-for-k-way-merge.patch (7.8K, 2-v1-0001-Support-loser-tree-for-k-way-merge.patch)
download | inline diff:
From 2981406dbf5bb738273b3ff4c26854ce7d715dd1 Mon Sep 17 00:00:00 2001
From: ChangAo Chen <[email protected]>
Date: Wed, 3 Dec 2025 18:58:28 +0800
Subject: [PATCH v1] Support loser tree for k-way merge.
The loser tree usually has fewer comparisons than the heap during
k-way merge.
---
src/backend/utils/misc/guc_parameters.dat | 7 +
src/backend/utils/sort/tuplesort.c | 160 +++++++++++++++++++++-
src/include/utils/guc.h | 1 +
3 files changed, 167 insertions(+), 1 deletion(-)
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 3b9d8349078..123ece8ea75 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -882,6 +882,13 @@
boot_val => 'true',
},
+{ name => 'enable_loser_tree', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+ short_desc => 'Enables loser tree for k-way merge.',
+ flags => 'GUC_NOT_IN_SAMPLE',
+ variable => 'enable_loser_tree',
+ boot_val => 'false',
+},
+
{ name => 'enable_material', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
short_desc => 'Enables the planner\'s use of materialization.',
flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 5d4411dc33f..381f5909094 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -122,6 +122,9 @@
/* GUC variables */
bool trace_sort = false;
+bool enable_loser_tree = false;
+
+#define LOSER_TREE_EOF -1
#ifdef DEBUG_BOUNDED_SORT
bool optimize_bounded_sort = true;
@@ -219,6 +222,9 @@ struct Tuplesortstate
int memtupsize; /* allocated length of memtuples array */
bool growmemtuples; /* memtuples' growth still underway? */
+ bool useLoserTree; /* use loser tree for k-way merge if true */
+ int *losers; /* array of losers, losers[0] is the winner */
+
/*
* Memory for tuples is sometimes allocated using a simple slab allocator,
* rather than with palloc(). Currently, we switch to slab allocation
@@ -468,6 +474,8 @@ static void tuplesort_sort_memtuples(Tuplesortstate *state);
static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
static void tuplesort_heap_delete_top(Tuplesortstate *state);
+static void tuplesort_loser_tree_build(Tuplesortstate *state);
+static void tuplesort_loser_tree_adjust(Tuplesortstate *state, int start);
static void reversedirection(Tuplesortstate *state);
static unsigned int getlen(LogicalTape *tape, bool eofOK);
static void markrunend(LogicalTape *tape);
@@ -681,6 +689,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
state->base.sortopt = sortopt;
state->base.tuples = true;
state->abbrevNext = 10;
+ state->useLoserTree = enable_loser_tree;
+ state->losers = NULL;
/*
* workMem is forced to be at least 64KB, the current minimum valid value
@@ -1647,6 +1657,37 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
state->lastReturnedTuple = NULL;
}
+ if (state->useLoserTree && state->memtupcount > 0)
+ {
+ int i = state->losers[0];
+ int start = i + state->memtupcount;
+ int srcTapeIndex = state->memtuples[i].srctape;
+ LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
+ SortTuple newtup;
+
+ *stup = state->memtuples[i];
+
+ state->lastReturnedTuple = stup->tuple;
+
+ if (mergereadnext(state, srcTape, &newtup))
+ {
+ newtup.srctape = srcTapeIndex;
+ state->memtuples[i] = newtup;
+ }
+ else
+ {
+ state->losers[start] = LOSER_TREE_EOF;
+ state->nInputRuns--;
+ LogicalTapeClose(srcTape);
+ }
+ tuplesort_loser_tree_adjust(state, start);
+
+ if (state->losers[0] == LOSER_TREE_EOF)
+ state->memtupcount = 0;
+
+ return true;
+ }
+
/*
* This code should match the inner loop of mergeonerun().
*/
@@ -2206,6 +2247,41 @@ mergeonerun(Tuplesortstate *state)
Assert(state->slabAllocatorUsed);
+ if (state->useLoserTree && state->memtupcount > 0)
+ {
+ while (state->losers[0] != LOSER_TREE_EOF)
+ {
+ int i = state->losers[0];
+ int start = i + state->memtupcount;
+ SortTuple stup;
+
+ /* write the tuple to destTape */
+ srcTapeIndex = state->memtuples[i].srctape;
+ srcTape = state->inputTapes[srcTapeIndex];
+ WRITETUP(state, state->destTape, &state->memtuples[i]);
+
+ /* recycle the slot of the tuple we just wrote out, for the next read */
+ if (state->memtuples[i].tuple)
+ RELEASE_SLAB_SLOT(state, state->memtuples[i].tuple);
+
+ if (mergereadnext(state, srcTape, &stup))
+ {
+ stup.srctape = srcTapeIndex;
+ state->memtuples[i] = stup;
+ }
+ else
+ {
+ state->losers[start] = LOSER_TREE_EOF;
+ state->nInputRuns--;
+ }
+
+ tuplesort_loser_tree_adjust(state, start);
+ }
+ state->memtupcount = 0;
+ markrunend(state->destTape);
+ return;
+ }
+
/*
* Execute merge by repeatedly extracting lowest tuple in heap, writing it
* out, and replacing it with next tuple from same tape (if there is
@@ -2270,9 +2346,16 @@ beginmerge(Tuplesortstate *state)
if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup))
{
tup.srctape = srcTapeIndex;
- tuplesort_heap_insert(state, &tup);
+
+ if (state->useLoserTree)
+ state->memtuples[state->memtupcount++] = tup;
+ else
+ tuplesort_heap_insert(state, &tup);
}
}
+
+ if (state->useLoserTree && state->memtupcount > 0)
+ tuplesort_loser_tree_build(state);
}
/*
@@ -2823,6 +2906,81 @@ tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
memtuples[i] = *tuple;
}
+static void
+tuplesort_loser_tree_build(Tuplesortstate *state)
+{
+ int k = state->memtupcount; /* k-way */
+ int winner[MAXORDER * 2];
+ int i;
+
+ Assert(k > 0 && k <= MAXORDER);
+
+ if (state->losers == NULL)
+ state->losers = (int *) MemoryContextAlloc(state->base.maincontext,
+ MAXORDER * 2 * sizeof(int));
+
+ for (i = k; i < 2 * k; i++)
+ {
+ winner[i] = i - k;
+ state->losers[i] = i - k;
+ }
+
+ for (i = k - 1; i >= 1; i--)
+ {
+ int l = i * 2;
+ int r = l + 1;
+
+ if (COMPARETUP(state,
+ &state->memtuples[winner[l]],
+ &state->memtuples[winner[r]]) < 0)
+ {
+ winner[i] = winner[l];
+ state->losers[i] = winner[r];
+ }
+ else
+ {
+ winner[i] = winner[r];
+ state->losers[i] = winner[l];
+ }
+ }
+
+ /* set the final winner to state->losers[0] */
+ state->losers[0] = winner[1];
+}
+
+static void
+tuplesort_loser_tree_adjust(Tuplesortstate *state, int start)
+{
+ int winner;
+ int parent;
+
+ Assert(state->losers != NULL);
+ Assert(start >= state->memtupcount && start < state->memtupcount * 2);
+
+ winner = state->losers[start];
+
+ for (parent = start / 2; parent > 0; parent /= 2)
+ {
+ int loser = state->losers[parent];
+
+ /* eof is always the loser */
+ if (loser == LOSER_TREE_EOF)
+ continue;
+
+ if (winner == LOSER_TREE_EOF ||
+ COMPARETUP(state,
+ &state->memtuples[winner],
+ &state->memtuples[loser]) > 0)
+ {
+ state->losers[parent] = winner;
+ winner = loser;
+ }
+ }
+
+ /* set the final winner to state->losers[0] */
+ state->losers[0] = winner;
+}
+
/*
* Function to reverse the sort direction from its current state
*
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index f21ec37da89..8c22eed716b 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -324,6 +324,7 @@ extern PGDLLIMPORT int tcp_user_timeout;
extern PGDLLIMPORT char *role_string;
extern PGDLLIMPORT bool in_hot_standby_guc;
extern PGDLLIMPORT bool trace_sort;
+extern PGDLLIMPORT bool enable_loser_tree;
#ifdef DEBUG_BOUNDED_SORT
extern PGDLLIMPORT bool optimize_bounded_sort;
--
2.34.1
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected], [email protected]
Subject: Re: Support loser tree for k-way merge
In-Reply-To: <[email protected]>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox