public inbox for [email protected]  
help / color / mirror / Atom feed
From: Chao Li <[email protected]>
To: John Naylor <[email protected]>
Cc: Chengpeng Yan <[email protected]>
Cc: [email protected] <[email protected]>
Cc: Peter Geoghegan <[email protected]>
Subject: Re: tuple radix sort
Date: Wed, 3 Dec 2025 16:22:13 +0800
Message-ID: <[email protected]> (raw)
In-Reply-To: <CANWCAZaKRzsYPn9YWRR8g5dkX-ep_jgyUb4r3KFekK5ZTNo7ew@mail.gmail.com>
References: <CANWCAZYzx7a7E9AY16Jt_U3+GVKDADfgApZ-42SYNiig8dTnFA@mail.gmail.com>
	<CANWCAZbsx5VY-F5RVvZ5H1abBWcinMY90gNra6EbdGS1CTx=7g@mail.gmail.com>
	<CANWCAZaN8kW6o7Ymc_jzPO_Z5SqekQTTNra3xTGYTH+cjrVp8g@mail.gmail.com>
	<[email protected]>
	<CANWCAZZiMGj6QuHfyBPv9at7xn23NPAz4nit=G6fr26V+h8MKg@mail.gmail.com>
	<[email protected]>
	<CANWCAZaKRzsYPn9YWRR8g5dkX-ep_jgyUb4r3KFekK5ZTNo7ew@mail.gmail.com>

Hi John,

I played with this again today and found an optimization that seems to dramatically improve the performance:

```
+static void
+radix_sort_tuple(SortTuple *begin, size_t n_elems, int level, Tuplesortstate *state)
+{
+	RadixPartitionInfo partitions[256] = {0};
+	uint8_t		remaining_partitions[256] = {0};
```

Here partitions and remaining_partitions are just temporary buffers, allocating memory from stack and initialize them seems slow. By passing them as function parameters are much faster. See attached diff for my change.

V5 patch: by the way, v5 is very faster than v1.
```
evantest=# select * from test_multi order by category, name;
Time: 299.912 ms
evantest=# select * from test_multi order by category, name;
Time: 298.793 ms
evantest=# select * from test_multi order by category, name;
Time: 300.306 ms
evantest=# select * from test_multi order by category, name;
Time: 302.140 ms
```

v5 + my change:
```
evantest=# select * from test_multi order by category, name;
Time: 152.572 ms
evantest=# select * from test_multi order by category, name;
Time: 143.296 ms
evantest=# select * from test_multi order by category, name;
Time: 151.606 ms
```

The test I did today is just the high cardinality first column test I had done before:
```
drop table if exists test_multi;
create unlogged table test_multi (category int, name text);
insert into test_multi select (random() * 1000000)::int as category,  md5(random()::text) || md5(random()::text) as name from generate_series(1, 1000000);
vacuum freeze test_multi;
\timing on
\o /dev/null
set wip_radix_sort = ‘on;
set work_mem = ‘2GB’;
select * from test_multi order by category, name;
```

Best regards,
--
Chao Li (Evan)
HighGo Software Co., Ltd.
https://www.highgo.com/








Attachments:

  [application/octet-stream] tuplesort_chaoli.diff (1.9K, 2-tuplesort_chaoli.diff)
  download | inline diff:
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 028c5b71c27..2bf2785f5ec 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -2760,10 +2760,11 @@ normalize_datum(Datum orig, SortSupport ssup)
  * DEALINGS IN THE SOFTWARE.
  */
 static void
-radix_sort_tuple(SortTuple *begin, size_t n_elems, int level, Tuplesortstate *state)
+radix_sort_tuple(SortTuple *begin, size_t n_elems, int level, Tuplesortstate *state,
+				 RadixPartitionInfo * partitions, uint8_t *remaining_partitions)
 {
-	RadixPartitionInfo partitions[256] = {0};
-	uint8_t		remaining_partitions[256] = {0};
+	/* RadixPartitionInfo partitions[256] = {0}; */
+	/* uint8_t		remaining_partitions[256] = {0}; */
 	size_t		total = 0;
 	int			num_partitions = 0;
 	int			num_remaining;
@@ -2771,6 +2772,9 @@ radix_sort_tuple(SortTuple *begin, size_t n_elems, int level, Tuplesortstate *st
 	size_t		start_offset = 0;
 	SortTuple  *partition_begin = begin;
 
+	memset(partitions, 0, sizeof(RadixPartitionInfo) * 256);
+	memset(remaining_partitions, 0, sizeof(uint8_t) * 256);
+
 	/* count number of occurrences of each byte */
 	for (SortTuple *tup = begin; tup < begin + n_elems; tup++)
 	{
@@ -2881,7 +2885,9 @@ radix_sort_tuple(SortTuple *begin, size_t n_elems, int level, Tuplesortstate *st
 					radix_sort_tuple(partition_begin,
 									 num_elements,
 									 level + 1,
-									 state);
+									 state,
+									 partitions,
+									 remaining_partitions);
 				}
 			}
 			else if (state->base.onlyKey == NULL)
@@ -3019,10 +3025,15 @@ sort_byvalue_datum(Tuplesortstate *state)
 		}
 		else
 		{
+			RadixPartitionInfo partitions[256];
+			uint8_t		remaining_partitions[256];
+
 			radix_sort_tuple(not_null_start,
 							 not_null_count,
 							 0,
-							 state);
+							 state,
+							 partitions,
+							 remaining_partitions);
 		}
 	}
 }


view thread (39+ messages)  latest in thread

reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Reply to all the recipients using the --to and --cc options:
  reply via email

  To: [email protected]
  Cc: [email protected], [email protected], [email protected], [email protected], [email protected]
  Subject: Re: tuple radix sort
  In-Reply-To: <[email protected]>

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox