public inbox for [email protected]  
help / color / mirror / Atom feed
From: SATYANARAYANA NARLAPURAM <[email protected]>
To: PostgreSQL Hackers <[email protected]>
To: Álvaro Herrera <[email protected]>
To: [email protected]
Subject: [PATCH] Compressed TOAST data corruption with REPACK CONCURRENTLY
Date: Wed, 15 Apr 2026 23:13:42 -0700
Message-ID: <CAHg+QDeXb9HM2VGKXQedyCp52GzajJK5KOUdNi6oLjsS0nerQw@mail.gmail.com> (raw)

Hi hackers,

restore_tuple() in repack.c uses SET_VARSIZE() to reconstruct the varlena
header when
reading back external attributes from the spill file. In this process,
looks like the flag
SET_VARSIZE_COMPRESSED is silently lost. Because of this, when REPACK
CONCURRENTLY
run  any concurrently updated column whose value was TOAST-compressed ends
up with raw
compressed bytes behind an "uncompressed" header returning garbled data on
subsequent reads.
It appears that existing tests are using random chars which are
uncompressable.

Please find the
attached 0001-Fix-restore_tuple-losing-varlena-compression-flag.patch to
fix this.
Additionally I updated the existing repack_toast test to include the
scenario I was talking about.

Thanks,
Satya


Attachments:

  [application/octet-stream] 0001-Fix-restore_tuple-losing-varlena-compression-flag.patch (654B, 3-0001-Fix-restore_tuple-losing-varlena-compression-flag.patch)
  download | inline diff:
diff --git a/src/backend/commands/repack.c b/src/backend/commands/repack.c
index 58e3867..e6b6fea 100644
--- a/src/backend/commands/repack.c
+++ b/src/backend/commands/repack.c
@@ -2727,7 +2727,10 @@ restore_tuple(BufFile *file, Relation relation, TupleTableSlot *slot)
 			varlensz = VARSIZE_ANY(&chunk_header);
 
 			value = palloc(varlensz);
-			SET_VARSIZE(value, VARSIZE_ANY(&chunk_header));
+			if (VARATT_IS_4B_C(&chunk_header))
+				SET_VARSIZE_COMPRESSED(value, varlensz);
+			else
+				SET_VARSIZE(value, varlensz);
 			BufFileReadExact(file, (char *) value + VARHDRSZ, varlensz - VARHDRSZ);
 
 			slot->tts_values[i] = PointerGetDatum(value);


  [application/octet-stream] 0002-Add-compressed-TOAST-test-to-repack_toast.patch (4.7K, 4-0002-Add-compressed-TOAST-test-to-repack_toast.patch)
  download | inline diff:
diff --git a/src/test/modules/injection_points/expected/repack_toast.out b/src/test/modules/injection_points/expected/repack_toast.out
index b56dde1..0eefc88 100644
--- a/src/test/modules/injection_points/expected/repack_toast.out
+++ b/src/test/modules/injection_points/expected/repack_toast.out
@@ -63,3 +63,60 @@ injection_points_detach
                        
 (1 row)
 
+
+starting permutation: repack_compressed compressed_update wakeup_before_lock verify_compressed
+injection_points_attach
+-----------------------
+                       
+(1 row)
+
+step repack_compressed: 
+	REPACK (CONCURRENTLY) repack_ctest;
+ <waiting ...>
+step compressed_update: 
+	UPDATE repack_ctest SET j = gen_compressible(99) WHERE i = 2;
+
+step wakeup_before_lock: 
+	SELECT injection_points_wakeup('repack-concurrently-before-lock');
+
+injection_points_wakeup
+-----------------------
+                       
+(1 row)
+
+step repack_compressed: <... completed>
+step verify_compressed: 
+	INSERT INTO post_repack_data
+	SELECT i, j FROM repack_ctest ORDER BY i;
+
+	SELECT e.i,
+	       length(e.j) AS expect_len,
+	       length(p.j) AS actual_len,
+	       (length(e.j) = length(p.j)) AS len_ok,
+	       (md5(e.j) = md5(p.j)) AS hash_ok
+	FROM expected_data e
+	JOIN post_repack_data p USING (i)
+	ORDER BY e.i;
+
+	SELECT count(*) AS mismatches
+	FROM expected_data e
+	JOIN post_repack_data p USING (i)
+	WHERE length(e.j) != length(p.j) OR md5(e.j) != md5(p.j);
+
+i|expect_len|actual_len|len_ok|hash_ok
+-+----------+----------+------+-------
+1|     16000|     16000|t     |t      
+2|     16000|     16000|t     |t      
+3|     16000|     16000|t     |t      
+(3 rows)
+
+mismatches
+----------
+         0
+(1 row)
+
+injection_points_detach
+-----------------------
+                       
+(1 row)
+
diff --git a/src/test/modules/injection_points/specs/repack_toast.spec b/src/test/modules/injection_points/specs/repack_toast.spec
index b878b19..fb8a557 100644
--- a/src/test/modules/injection_points/specs/repack_toast.spec
+++ b/src/test/modules/injection_points/specs/repack_toast.spec
@@ -13,6 +13,14 @@ setup
 		FROM generate_series(1, 2048) s(x);
 	$$;
 
+	-- 500 concatenated md5 hashes = 16000 hex chars; compresses but stays > 2KB.
+	CREATE FUNCTION gen_compressible(seed int)
+	RETURNS text
+	LANGUAGE sql IMMUTABLE as $$
+		SELECT string_agg(md5((seed * 1000 + x)::text), '')
+		FROM generate_series(1, 500) x;
+	$$;
+
 	CREATE TABLE repack_test(i int PRIMARY KEY, j text);
 	INSERT INTO repack_test(i, j) VALUES (1, get_long_string()),
 		(2, get_long_string()), (3, get_long_string());
@@ -21,13 +29,30 @@ setup
 
 	CREATE TABLE data_s1(i int, j text);
 	CREATE TABLE data_s2(i int, j text);
+
+	CREATE TABLE repack_ctest(i int PRIMARY KEY, j text);
+	INSERT INTO repack_ctest(i, j)
+	VALUES (1, gen_compressible(1)),
+	       (2, gen_compressible(2)),
+	       (3, gen_compressible(3));
+
+	CREATE TABLE expected_data AS
+	SELECT 1 AS i, gen_compressible(1) AS j
+	UNION ALL SELECT 2, gen_compressible(99)
+	UNION ALL SELECT 3, gen_compressible(3);
+
+	CREATE TABLE post_repack_data(i int, j text);
 }
 
 teardown
 {
 	DROP TABLE repack_test;
+	DROP TABLE repack_ctest;
+	DROP TABLE expected_data;
+	DROP TABLE post_repack_data;
 	DROP EXTENSION injection_points;
 	DROP FUNCTION get_long_string();
+	DROP FUNCTION gen_compressible(int);
 
 	DROP TABLE relfilenodes;
 	DROP TABLE data_s1;
@@ -67,6 +92,29 @@ step check1
 	FROM data_s1 d1 FULL JOIN data_s2 d2 USING (i, j)
 	WHERE d1.i ISNULL OR d2.i ISNULL;
 }
+step repack_compressed
+{
+	REPACK (CONCURRENTLY) repack_ctest;
+}
+step verify_compressed
+{
+	INSERT INTO post_repack_data
+	SELECT i, j FROM repack_ctest ORDER BY i;
+
+	SELECT e.i,
+	       length(e.j) AS expect_len,
+	       length(p.j) AS actual_len,
+	       (length(e.j) = length(p.j)) AS len_ok,
+	       (md5(e.j) = md5(p.j)) AS hash_ok
+	FROM expected_data e
+	JOIN post_repack_data p USING (i)
+	ORDER BY e.i;
+
+	SELECT count(*) AS mismatches
+	FROM expected_data e
+	JOIN post_repack_data p USING (i)
+	WHERE length(e.j) != length(p.j) OR md5(e.j) != md5(p.j);
+}
 teardown
 {
     SELECT injection_points_detach('repack-concurrently-before-lock');
@@ -101,6 +149,10 @@ step wakeup_before_lock
 {
 	SELECT injection_points_wakeup('repack-concurrently-before-lock');
 }
+step compressed_update
+{
+	UPDATE repack_ctest SET j = gen_compressible(99) WHERE i = 2;
+}
 
 # Test if data changes introduced while one session is performing REPACK
 # CONCURRENTLY find their way into the table.
@@ -110,3 +162,10 @@ permutation
 	check2
 	wakeup_before_lock
 	check1
+
+# Test that compressed TOAST values survive REPACK CONCURRENTLY.
+permutation
+	repack_compressed
+	compressed_update
+	wakeup_before_lock
+	verify_compressed


view thread (10+ messages)  latest in thread

reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Reply to all the recipients using the --to and --cc options:
  reply via email

  To: [email protected]
  Cc: [email protected], [email protected], [email protected], [email protected]
  Subject: Re: [PATCH] Compressed TOAST data corruption with REPACK CONCURRENTLY
  In-Reply-To: <CAHg+QDeXb9HM2VGKXQedyCp52GzajJK5KOUdNi6oLjsS0nerQw@mail.gmail.com>

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox