From 3d3ca5ab3c6f6d2e7c275758dbb951e84d289e5e Mon Sep 17 00:00:00 2001 From: Anthonin Bonnefoy Date: Fri, 12 Dec 2025 09:56:00 +0100 Subject: Propagate errormsg to XLogFindNextRecord caller Currently, XLogFindNextRecord errormsg is ignored and callers will only output a generic 'could not find a valid record' message without details. Additionally, invalid page header won't go through XLogReadRecord, leaving the error in state->errormsg_buf. This patch propagates XLogFindNextRecord's error message to the callers and displays it. In case of an invalid page header, the errormsg is filled with errormsg_buf content. --- contrib/pg_walinspect/pg_walinspect.c | 16 ++++++++++++---- src/backend/access/transam/xlogreader.c | 18 +++++++++++++++--- src/backend/postmaster/walsummarizer.c | 17 ++++++++++++----- src/bin/pg_waldump/pg_waldump.c | 12 +++++++++--- src/bin/pg_waldump/t/001_basic.pl | 23 +++++++++++++++++++++++ src/include/access/xlogreader.h | 2 +- 6 files changed, 72 insertions(+), 16 deletions(-) diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c index 716a0922c6b..98dd31b69bb 100644 --- a/contrib/pg_walinspect/pg_walinspect.c +++ b/contrib/pg_walinspect/pg_walinspect.c @@ -97,6 +97,7 @@ InitXLogReaderState(XLogRecPtr lsn) XLogReaderState *xlogreader; ReadLocalXLogPageNoWaitPrivate *private_data; XLogRecPtr first_valid_record; + char *errormsg; /* * Reading WAL below the first page of the first segments isn't allowed. @@ -124,12 +125,19 @@ InitXLogReaderState(XLogRecPtr lsn) errdetail("Failed while allocating a WAL reading processor."))); /* first find a valid recptr to start from */ - first_valid_record = XLogFindNextRecord(xlogreader, lsn); + first_valid_record = XLogFindNextRecord(xlogreader, lsn, &errormsg); if (!XLogRecPtrIsValid(first_valid_record)) - ereport(ERROR, - errmsg("could not find a valid record after %X/%08X", - LSN_FORMAT_ARGS(lsn))); + { + if (errormsg) + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X: %s", + LSN_FORMAT_ARGS(lsn), errormsg)); + else + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(lsn))); + } return xlogreader; } diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 03ada8aa0c5..c8aa84e5652 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1391,12 +1391,13 @@ XLogReaderResetError(XLogReaderState *state) * XLogReadRecord() will read the next valid record. */ XLogRecPtr -XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) +XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) { XLogRecPtr tmpRecPtr; XLogRecPtr found = InvalidXLogRecPtr; XLogPageHeader header; - char *errormsg; + + *errormsg = NULL; Assert(XLogRecPtrIsValid(RecPtr)); @@ -1481,7 +1482,7 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) * or we just jumped over the remaining data of a continuation. */ XLogBeginRead(state, tmpRecPtr); - while (XLogReadRecord(state, &errormsg) != NULL) + while (XLogReadRecord(state, errormsg) != NULL) { /* past the record we've found, break out */ if (RecPtr <= state->ReadRecPtr) @@ -1496,6 +1497,17 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) err: XLogReaderInvalReadState(state); + /* + * We may have reported errors due to invalid WAL header, propagate the + * error message to the caller. + */ + if (state->errormsg_deferred) + { + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + state->errormsg_deferred = false; + } + return InvalidXLogRecPtr; } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 2d8f57099fd..f50976a7f76 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -917,6 +917,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, WalSummaryIO io; BlockRefTable *brtab = CreateEmptyBlockRefTable(); bool fast_forward = true; + char *errormsg; /* Initialize private data for xlogreader. */ private_data = palloc0_object(SummarizerReadLocalXLogPrivate); @@ -968,7 +969,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, } else { - summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn); + summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn, &errormsg); if (!XLogRecPtrIsValid(summary_start_lsn)) { /* @@ -997,9 +998,16 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, switch_lsn = xlogreader->EndRecPtr; } else - ereport(ERROR, - errmsg("could not find a valid record after %X/%08X", - LSN_FORMAT_ARGS(start_lsn))); + { + if (errormsg) + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X: %s", + LSN_FORMAT_ARGS(start_lsn), errormsg)); + else + ereport(ERROR, + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(start_lsn))); + } } /* We shouldn't go backward. */ @@ -1012,7 +1020,6 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, while (1) { int block_id; - char *errormsg; XLogRecord *record; uint8 rmid; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index f3446385d6a..503ba76e559 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -1212,11 +1212,17 @@ main(int argc, char **argv) pg_fatal("out of memory while allocating a WAL reading processor"); /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + first_record = XLogFindNextRecord(xlogreader_state, private.startptr, &errormsg); if (!XLogRecPtrIsValid(first_record)) - pg_fatal("could not find a valid record after %X/%08X", - LSN_FORMAT_ARGS(private.startptr)); + { + if (errormsg) + pg_fatal("could not find a valid record after %X/%08X: %s", + LSN_FORMAT_ARGS(private.startptr), errormsg); + else + pg_fatal("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl index 5db5d20136f..1898848e418 100644 --- a/src/bin/pg_waldump/t/001_basic.pl +++ b/src/bin/pg_waldump/t/001_basic.pl @@ -6,6 +6,7 @@ use warnings FATAL => 'all'; use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; +use File::Copy; program_help_ok('pg_waldump'); program_version_ok('pg_waldump'); @@ -236,6 +237,28 @@ command_fails_like( qr/error: error in WAL record at/, 'errors are shown with --quiet'); +# Wrong WAL version. We copy an existing wal file and set the +# page's magic value to 0000. +{ + my $broken_wal_dir = PostgreSQL::Test::Utils::tempdir_short(); + my $broken_wal = "$broken_wal_dir/$start_walfile"; + copy($node->data_dir . '/pg_wal/' . $start_walfile, $broken_wal) + || die "copying $start_walfile $!"; + + my $fh; + open($fh, '+<', $broken_wal); + sysseek($fh, 0, 0) + or BAIL_OUT("sysseek failed: $!"); + syswrite($fh, pack("S", 0)) + or BAIL_OUT("syswrite failed: $!"); + close($fh); + + command_fails_like( + [ 'pg_waldump', $broken_wal ], + qr/invalid magic number 0000/i, + 'detailed error message shown for invalid WAL page magic' + ); +} # Test for: Display a message that we're skipping data if `from` # wasn't a pointer to the start of a record. diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 80f1a548e08..567df33ac8e 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -342,7 +342,7 @@ extern void XLogReaderSetDecodeBuffer(XLogReaderState *state, /* Position the XLogReader to given record */ extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr); -extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr); +extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg); /* Return values from XLogPageReadCB. */ typedef enum XLogPageReadResult -- 2.52.0