From 4322e9804f9bce7f9fb30872c5d64736e91c653b Mon Sep 17 00:00:00 2001 From: Amul Sul Date: Tue, 10 Feb 2026 11:42:36 +0530 Subject: [PATCH v14 06/11] pg_waldump: Add support for archived WAL decoding. pg_waldump can now accept the path to a tar archive containing WAL files and decode them. This feature was added primarily for pg_verifybackup, which previously disabled WAL parsing for tar-formatted backups. Note that this patch requires that the WAL files within the archive be in sequential order; an error will be reported otherwise. The next patch is planned to remove this restriction. --- doc/src/sgml/ref/pg_waldump.sgml | 8 +- src/bin/pg_waldump/Makefile | 7 +- src/bin/pg_waldump/archive_waldump.c | 639 +++++++++++++++++++++++++++ src/bin/pg_waldump/meson.build | 4 +- src/bin/pg_waldump/pg_waldump.c | 255 ++++++++--- src/bin/pg_waldump/pg_waldump.h | 43 ++ src/bin/pg_waldump/t/001_basic.pl | 105 ++++- src/tools/pgindent/typedefs.list | 3 + 8 files changed, 998 insertions(+), 66 deletions(-) create mode 100644 src/bin/pg_waldump/archive_waldump.c diff --git a/doc/src/sgml/ref/pg_waldump.sgml b/doc/src/sgml/ref/pg_waldump.sgml index d1715ff5124..15fb8d13199 100644 --- a/doc/src/sgml/ref/pg_waldump.sgml +++ b/doc/src/sgml/ref/pg_waldump.sgml @@ -141,13 +141,17 @@ PostgreSQL documentation - Specifies a directory to search for WAL segment files or a - directory with a pg_wal subdirectory that + Specifies a tar archive or a directory to search for WAL segment files + or a directory with a pg_wal subdirectory that contains such files. The default is to search in the current directory, the pg_wal subdirectory of the current directory, and the pg_wal subdirectory of PGDATA. + + If a tar archive is provided, its WAL segment files must be in + sequential order; otherwise, an error will be reported. + diff --git a/src/bin/pg_waldump/Makefile b/src/bin/pg_waldump/Makefile index 4c1ee649501..aabb87566a2 100644 --- a/src/bin/pg_waldump/Makefile +++ b/src/bin/pg_waldump/Makefile @@ -3,6 +3,9 @@ PGFILEDESC = "pg_waldump - decode and display WAL" PGAPPICON=win32 +# make these available to TAP test scripts +export TAR + subdir = src/bin/pg_waldump top_builddir = ../../.. include $(top_builddir)/src/Makefile.global @@ -10,13 +13,15 @@ include $(top_builddir)/src/Makefile.global OBJS = \ $(RMGRDESCOBJS) \ $(WIN32RES) \ + archive_waldump.o \ compat.o \ pg_waldump.o \ rmgrdesc.o \ xlogreader.o \ xlogstats.o -override CPPFLAGS := -DFRONTEND $(CPPFLAGS) +override CPPFLAGS := -DFRONTEND -I$(libpq_srcdir) $(CPPFLAGS) +LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils RMGRDESCSOURCES = $(sort $(notdir $(wildcard $(top_srcdir)/src/backend/access/rmgrdesc/*desc*.c))) RMGRDESCOBJS = $(patsubst %.c,%.o,$(RMGRDESCSOURCES)) diff --git a/src/bin/pg_waldump/archive_waldump.c b/src/bin/pg_waldump/archive_waldump.c new file mode 100644 index 00000000000..17d27ffa520 --- /dev/null +++ b/src/bin/pg_waldump/archive_waldump.c @@ -0,0 +1,639 @@ +/*------------------------------------------------------------------------- + * + * archive_waldump.c + * A generic facility for reading WAL data from tar archives via archive + * streamer. + * + * Portions Copyright (c) 2026, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_waldump/archive_waldump.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include + +#include "access/xlog_internal.h" +#include "common/hashfn.h" +#include "common/logging.h" +#include "fe_utils/simple_list.h" +#include "pg_waldump.h" + +/* + * How many bytes should we try to read from a file at once? + */ +#define READ_CHUNK_SIZE (128 * 1024) + +/* + * Check if the start segment number is zero; this indicates a request to read + * any WAL file. + */ +#define READ_ANY_WAL(privateInfo) ((privateInfo)->start_segno == 0) + +/* + * Hash entry representing a WAL segment retrieved from the archive. + * + * While WAL segments are typically read sequentially, individual entries + * maintain their own buffers for the following reasons: + * + * 1. Boundary Handling: The archive streamer provides a continuous byte + * stream. A single streaming chunk may contain the end of one WAL segment + * and the start of the next. Separate buffers allow us to easily + * partition and track these bytes by their respective segments. + * + * 2. Out-of-Order Support: Dedicated buffers simplify logic if segments + * are ever archived or retrieved out of sequence. + * + * To minimize the memory footprint, entries and their associated buffers are + * freed immediately once consumed. Since pg_waldump does not request the same + * bytes twice, a segment is discarded as soon as it moves past it. + */ +typedef struct ArchivedWALFile +{ + uint32 status; /* hash status */ + const char *fname; /* hash key: WAL segment name */ + + StringInfo buf; /* holds WAL bytes read from archive */ + + int read_len; /* total bytes of a WAL read from archive */ +} ArchivedWALFile; + +static uint32 hash_string_pointer(const char *s); +#define SH_PREFIX ArchivedWAL +#define SH_ELEMENT_TYPE ArchivedWALFile +#define SH_KEY_TYPE const char * +#define SH_KEY fname +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +typedef struct astreamer_waldump +{ + astreamer base; + XLogDumpPrivate *privateInfo; +} astreamer_waldump; + +static ArchivedWALFile *get_archive_wal_entry(const char *fname, + XLogDumpPrivate *privateInfo, + int WalSegSz); +static int read_archive_file(XLogDumpPrivate *privateInfo, Size count); + +static astreamer *astreamer_waldump_new(XLogDumpPrivate *privateInfo); +static void astreamer_waldump_content(astreamer *streamer, + astreamer_member *member, + const char *data, int len, + astreamer_archive_context context); +static void astreamer_waldump_finalize(astreamer *streamer); +static void astreamer_waldump_free(astreamer *streamer); + +static bool member_is_wal_file(astreamer_waldump *mystreamer, + astreamer_member *member, + char **fname); + +static const astreamer_ops astreamer_waldump_ops = { + .content = astreamer_waldump_content, + .finalize = astreamer_waldump_finalize, + .free = astreamer_waldump_free +}; + +/* + * Initializes the tar archive reader, creates a hash table for WAL entries, + * checks for existing valid WAL segments in the archive file and retrieves the + * segment size, and sets up filters for relevant entries. + */ +void +init_archive_reader(XLogDumpPrivate *privateInfo, const char *waldir, + int *WalSegSz, pg_compress_algorithm compression) +{ + int fd; + astreamer *streamer; + ArchivedWALFile *entry = NULL; + XLogLongPageHeader longhdr; + XLogSegNo segno; + TimeLineID timeline; + + /* Open tar archive and store its file descriptor */ + fd = open_file_in_directory(waldir, privateInfo->archive_name); + + if (fd < 0) + pg_fatal("could not open file \"%s\"", privateInfo->archive_name); + + privateInfo->archive_fd = fd; + + streamer = astreamer_waldump_new(privateInfo); + + /* Before that we must parse the tar archive. */ + streamer = astreamer_tar_parser_new(streamer); + + /* Before that we must decompress, if archive is compressed. */ + if (compression == PG_COMPRESSION_GZIP) + streamer = astreamer_gzip_decompressor_new(streamer); + else if (compression == PG_COMPRESSION_LZ4) + streamer = astreamer_lz4_decompressor_new(streamer); + else if (compression == PG_COMPRESSION_ZSTD) + streamer = astreamer_zstd_decompressor_new(streamer); + + privateInfo->archive_streamer = streamer; + + /* + * Hash table storing WAL entries read from the archive with an arbitrary + * initial size + */ + privateInfo->archive_wal_htab = ArchivedWAL_create(8, NULL); + + /* + * Verify that the archive contains valid WAL files and fetch WAL segment + * size + */ + while (entry == NULL || entry->buf->len < XLOG_BLCKSZ) + { + if (read_archive_file(privateInfo, XLOG_BLCKSZ) == 0) + pg_fatal("could not find WAL in archive \"%s\"", + privateInfo->archive_name); + + entry = privateInfo->cur_file; + } + + /* Set WalSegSz if WAL data is successfully read */ + longhdr = (XLogLongPageHeader) entry->buf->data; + + if (!IsValidWalSegSize(longhdr->xlp_seg_size)) + { + pg_log_error(ngettext("invalid WAL segment size in WAL file from archive \"%s\" (%d byte)", + "invalid WAL segment size in WAL file from archive \"%s\" (%d bytes)", + longhdr->xlp_seg_size), + privateInfo->archive_name, longhdr->xlp_seg_size); + pg_log_error_detail("The WAL segment size must be a power of two between 1 MB and 1 GB."); + exit(1); + } + + *WalSegSz = longhdr->xlp_seg_size; + + /* + * With the WAL segment size available, we can now initialize the + * dependent start and end segment numbers. + */ + Assert(!XLogRecPtrIsInvalid(privateInfo->startptr)); + XLByteToSeg(privateInfo->startptr, privateInfo->start_segno, *WalSegSz); + + if (!XLogRecPtrIsInvalid(privateInfo->endptr)) + XLByteToSeg(privateInfo->endptr, privateInfo->end_segno, *WalSegSz); + + /* + * This WAL record was fetched before the filtering parameters + * (start_segno and end_segno) were fully initialized. Perform the + * relevance check against the user-provided range now; if the WAL falls + * outside this range, remove it from the hash table. Subsequent WAL will + * be filtered automatically by the archived streamer using the updated + * start_segno and end_segno values. + */ + XLogFromFileName(entry->fname, &timeline, &segno, privateInfo->segsize); + if (privateInfo->timeline != timeline || + privateInfo->start_segno > segno || + privateInfo->end_segno < segno) + free_archive_wal_entry(entry->fname, privateInfo); +} + +/* + * Release the archive streamer chain and close the archive file. + */ +void +free_archive_reader(XLogDumpPrivate *privateInfo) +{ + /* + * NB: Normally, astreamer_finalize() is called before astreamer_free() to + * flush any remaining buffered data or to ensure the end of the tar + * archive is reached. However, when decoding a WAL file, once we hit the + * end LSN, any remaining WAL data in the buffer or the tar archive's + * unreached end can be safely ignored. + */ + astreamer_free(privateInfo->archive_streamer); + + /* Close the file. */ + if (close(privateInfo->archive_fd) != 0) + pg_log_error("could not close file \"%s\": %m", + privateInfo->archive_name); +} + +/* + * Copies WAL data from astreamer to readBuff; if unavailable, fetches more + * from the tar archive via astreamer. + */ +int +read_archive_wal_page(XLogDumpPrivate *privateInfo, XLogRecPtr targetPagePtr, + Size count, char *readBuff, int WalSegSz) +{ + char *p = readBuff; + Size nbytes = count; + XLogRecPtr recptr = targetPagePtr; + XLogSegNo segno; + char fname[MAXFNAMELEN]; + ArchivedWALFile *entry; + + /* Identify the segment and locate its entry in the archive hash */ + XLByteToSeg(targetPagePtr, segno, WalSegSz); + XLogFileName(fname, privateInfo->timeline, segno, WalSegSz); + entry = get_archive_wal_entry(fname, privateInfo, WalSegSz); + + while (nbytes > 0) + { + char *buf = entry->buf->data; + int bufLen = entry->buf->len; + XLogRecPtr endPtr; + XLogRecPtr startPtr; + + /* Calculate the LSN range currently residing in the buffer */ + XLogSegNoOffsetToRecPtr(segno, entry->read_len, WalSegSz, endPtr); + startPtr = endPtr - bufLen; + + /* + * Copy the requested WAL record if it exists in the buffer. + */ + if (bufLen > 0 && startPtr <= recptr && recptr < endPtr) + { + int copyBytes; + int offset = recptr - startPtr; + + /* + * Given startPtr <= recptr < endPtr and a total buffer size + * 'bufLen', the offset (recptr - startPtr) will always be less + * than 'bufLen'. + */ + Assert(offset < bufLen); + + copyBytes = Min(nbytes, bufLen - offset); + memcpy(p, buf + offset, copyBytes); + + /* Update state for read */ + recptr += copyBytes; + nbytes -= copyBytes; + p += copyBytes; + } + else + { + /* + * Before starting the actual decoding loop, pg_waldump tries to + * locate the first valid record from the user-specified start + * position, which might not be the start of a WAL record and + * could fall in the middle of a record that spans multiple pages. + * Consequently, the valid start position the decoder is looking + * for could be far away from that initial position. + * + * This may involve reading across multiple pages, and this + * pre-reading fetches data in multiple rounds from the archive + * streamer; normally, we would throw away existing buffer + * contents to fetch the next set of data, but that existing data + * might be needed once the main loop starts. Because previously + * read data cannot be re-read by the archive streamer, we delay + * resetting the buffer until the main decoding loop is entered. + * + * Once pg_waldump has entered the main loop, it may re-read the + * currently active page, but never an older one; therefore, any + * fully consumed WAL data preceding the current page can then be + * safely discarded. + */ + if (privateInfo->decoding_started) + { + resetStringInfo(entry->buf); + + /* + * Push back the partial page data for the current page to the + * buffer, ensuring it remains full page available for + * re-reading if requested. + */ + if (p > readBuff) + { + Assert((count - nbytes) > 0); + appendBinaryStringInfo(entry->buf, readBuff, count - nbytes); + } + } + + /* + * Now, fetch more data; raise an error if it's not the current + * segment being read by the archive streamer or if reading of the + * archived file has finished. + */ + if (privateInfo->cur_file != entry || + read_archive_file(privateInfo, READ_CHUNK_SIZE) == 0) + pg_fatal("could not read file \"%s\" from archive \"%s\": read %lld of %lld", + fname, privateInfo->archive_name, + (long long int) count - nbytes, + (long long int) nbytes); + } + } + + /* + * Should have either have successfully read all the requested bytes or + * reported a failure before this point. + */ + Assert(nbytes == 0); + + /* + * NB: We return the fixed value provided as input. Although we could + * return a boolean since we either successfully read the WAL page or + * raise an error, but the caller expects this value to be returned. The + * routine that reads WAL pages from the physical WAL file follows the + * same convention. + */ + return count; +} + +/* + * Clears the buffer of a WAL entry that is being ignored. This frees up memory + * and prevents the accumulation of irrelevant WAL data. Additionally, + * conditionally setting cur_file within privateinfo to NULL ensures the + * archive streamer skips unnecessary copy operations + */ +void +free_archive_wal_entry(const char *fname, XLogDumpPrivate *privateInfo) +{ + ArchivedWALFile *entry; + + entry = ArchivedWAL_lookup(privateInfo->archive_wal_htab, fname); + + if (entry == NULL) + return; + + /* Destroy the buffer */ + destroyStringInfo(entry->buf); + entry->buf = NULL; + + /* Set cur_file to NULL if it matches the entry being ignored */ + if (privateInfo->cur_file == entry) + privateInfo->cur_file = NULL; + + ArchivedWAL_delete_item(privateInfo->archive_wal_htab, entry); +} + +/* + * Returns the archived WAL entry from the hash table if it exists. Otherwise, + * it invokes the routine to read the archived file, which then populates the + * entry in the hash table if that WAL exists in the archive. + */ +static ArchivedWALFile * +get_archive_wal_entry(const char *fname, XLogDumpPrivate *privateInfo, + int WalSegSz) +{ + ArchivedWALFile *entry = NULL; + + /* Search hash table */ + entry = ArchivedWAL_lookup(privateInfo->archive_wal_htab, fname); + + if (entry != NULL) + return entry; + + /* + * The requested WAL entry has not been read from the archive yet; invoke + * the archive streamer to read it. + */ + while (1) + { + /* Fetch more data */ + if (read_archive_file(privateInfo, READ_CHUNK_SIZE) == 0) + break; /* archive file ended */ + + /* + * Archived streamer is reading a non-WAL file or an irrelevant WAL + * file. + */ + if (privateInfo->cur_file == NULL) + continue; + + entry = privateInfo->cur_file; + + /* Found the required entry */ + if (strcmp(fname, entry->fname) == 0) + return entry; + + /* WAL segments must be archived in order */ + pg_log_error("WAL files are not archived in sequential order"); + pg_log_error_detail("Expecting segment \"%s\" but found \"%s\".", + fname, entry->fname); + exit(1); + } + + /* Requested WAL segment not found */ + pg_fatal("could not find WAL \"%s\" in archive \"%s\"", + fname, privateInfo->archive_name); +} + +/* + * Reads the archive file and passes it to the archive streamer for + * decompression. + */ +static int +read_archive_file(XLogDumpPrivate *privateInfo, Size count) +{ + int rc; + char *buffer; + + buffer = pg_malloc(count * sizeof(uint8)); + + rc = read(privateInfo->archive_fd, buffer, count); + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", + privateInfo->archive_name); + + /* + * Decompress (if required), and then parse the previously read contents + * of the tar file. + */ + if (rc > 0) + astreamer_content(privateInfo->archive_streamer, NULL, + buffer, rc, ASTREAMER_UNKNOWN); + pg_free(buffer); + + return rc; +} + +/* + * Create an astreamer that can read WAL from a tar file. + */ +static astreamer * +astreamer_waldump_new(XLogDumpPrivate *privateInfo) +{ + astreamer_waldump *streamer; + + streamer = palloc0_object(astreamer_waldump); + *((const astreamer_ops **) &streamer->base.bbs_ops) = + &astreamer_waldump_ops; + + streamer->privateInfo = privateInfo; + + return &streamer->base; +} + +/* + * Main entry point of the archive streamer for reading WAL data from a tar + * file. If a member is identified as a valid WAL file, a hash entry is created + * for it, and its contents are copied into that entry's buffer, making them + * accessible to the decoding routine. + */ +static void +astreamer_waldump_content(astreamer *streamer, astreamer_member *member, + const char *data, int len, + astreamer_archive_context context) +{ + astreamer_waldump *mystreamer = (astreamer_waldump *) streamer; + XLogDumpPrivate *privateInfo = mystreamer->privateInfo; + + Assert(context != ASTREAMER_UNKNOWN); + + switch (context) + { + case ASTREAMER_MEMBER_HEADER: + { + char *fname = NULL; + ArchivedWALFile *entry; + bool found; + + pg_log_debug("reading \"%s\"", member->pathname); + + if (!member_is_wal_file(mystreamer, member, &fname)) + break; + + /* + * Further checks are skipped if any WAL file can be read. + * This typically occurs during initial verification. + */ + if (!READ_ANY_WAL(privateInfo)) + { + XLogSegNo segno; + TimeLineID timeline; + + /* + * Skip the segment if the timeline does not match, if it + * falls outside the caller-specified range. + */ + XLogFromFileName(fname, &timeline, &segno, privateInfo->segsize); + if (privateInfo->timeline != timeline || + privateInfo->start_segno > segno || + privateInfo->end_segno < segno) + { + free(fname); + break; + } + } + + entry = ArchivedWAL_insert(privateInfo->archive_wal_htab, + fname, &found); + + /* + * Shouldn't happen, but if it does, simply ignore the + * duplicate WAL file. + */ + if (found) + { + pg_log_warning("ignoring duplicate WAL \"%s\" found in archive \"%s\"", + member->pathname, privateInfo->archive_name); + break; + } + + entry->buf = makeStringInfo(); + entry->spilled = false; + entry->read_len = 0; + privateInfo->cur_file = entry; + } + break; + + case ASTREAMER_MEMBER_CONTENTS: + if (privateInfo->cur_file) + { + appendBinaryStringInfo(privateInfo->cur_file->buf, data, len); + privateInfo->cur_file->read_len += len; + } + break; + + case ASTREAMER_MEMBER_TRAILER: + privateInfo->cur_file = NULL; + break; + + case ASTREAMER_ARCHIVE_TRAILER: + break; + + default: + /* Shouldn't happen. */ + pg_fatal("unexpected state while parsing tar file"); + } +} + +/* + * End-of-stream processing for an astreamer_waldump stream. + */ +static void +astreamer_waldump_finalize(astreamer *streamer) +{ + Assert(streamer->bbs_next == NULL); +} + +/* + * Free memory associated with a astreamer_waldump stream. + */ +static void +astreamer_waldump_free(astreamer *streamer) +{ + Assert(streamer->bbs_next == NULL); + pfree(streamer); +} + +/* + * Returns true if the archive member name matches the WAL naming format. If + * successful, it also outputs the WAL segment name. + */ +static bool +member_is_wal_file(astreamer_waldump *mystreamer, astreamer_member *member, + char **fname) +{ + int pathlen; + char pathname[MAXPGPATH]; + char *filename; + + /* We are only interested in normal files. */ + if (member->is_directory || member->is_link) + return false; + + if (strlen(member->pathname) < XLOG_FNAME_LEN) + return false; + + /* + * For a correct comparison, we must remove any '.' or '..' components + * from the member pathname. Similar to member_verify_header(), we prepend + * './' to the path so that canonicalize_path() can properly resolve and + * strip these references from the tar member name + */ + snprintf(pathname, MAXPGPATH, "./%s", member->pathname); + canonicalize_path(pathname); + pathlen = strlen(pathname); + + /* WAL files from the top-level or pg_wal directory will be decoded */ + if (pathlen > XLOG_FNAME_LEN && + strncmp(pathname, XLOGDIR, strlen(XLOGDIR)) != 0) + return false; + + /* WAL file could be with full path */ + filename = pathname + (pathlen - XLOG_FNAME_LEN); + if (!IsXLogFileName(filename)) + return false; + + *fname = pnstrdup(filename, XLOG_FNAME_LEN); + + return true; +} + +/* + * Helper function for filemap hash table. + */ +static uint32 +hash_string_pointer(const char *s) +{ + unsigned char *ss = (unsigned char *) s; + + return hash_bytes(ss, strlen(s)); +} diff --git a/src/bin/pg_waldump/meson.build b/src/bin/pg_waldump/meson.build index 633a9874bb5..5296f21b82c 100644 --- a/src/bin/pg_waldump/meson.build +++ b/src/bin/pg_waldump/meson.build @@ -1,6 +1,7 @@ # Copyright (c) 2022-2026, PostgreSQL Global Development Group pg_waldump_sources = files( + 'archive_waldump.c', 'compat.c', 'pg_waldump.c', 'rmgrdesc.c', @@ -18,7 +19,7 @@ endif pg_waldump = executable('pg_waldump', pg_waldump_sources, - dependencies: [frontend_code, lz4, zstd], + dependencies: [frontend_code, libpq, lz4, zstd], c_args: ['-DFRONTEND'], # needed for xlogreader et al kwargs: default_bin_args, ) @@ -29,6 +30,7 @@ tests += { 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), 'tap': { + 'env': {'TAR': tar.found() ? tar.full_path() : ''}, 'tests': [ 't/001_basic.pl', 't/002_save_fullpage.pl', diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 5d31b15dbd8..90fc13f3609 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -176,7 +176,7 @@ split_path(const char *path, char **dir, char **fname) * * return a read only fd */ -static int +int open_file_in_directory(const char *directory, const char *fname) { int fd = -1; @@ -440,6 +440,80 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, return count; } +/* + * pg_waldump's XLogReaderRoutine->segment_open callback to support dumping WAL + * files from tar archives. + */ +static void +TarWALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + /* No action needed */ +} + +/* + * pg_waldump's XLogReaderRoutine->segment_close callback. + */ +static void +TarWALDumpCloseSegment(XLogReaderState *state) +{ + /* No action needed */ +} + +/* + * pg_waldump's XLogReaderRoutine->page_read callback to support dumping WAL + * files from tar archives. + */ +static int +TarWALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetPtr, char *readBuff) +{ + XLogDumpPrivate *private = state->private_data; + int count = required_read_len(private, targetPagePtr, reqLen); + int WalSegSz = state->segcxt.ws_segsize; + XLogSegNo curSegNo; + + /* Bail out if the count to be read is not valid */ + if (count < 0) + return -1; + + /* + * If the target page is in a different segment, free the buffer space + * occupied by the previous segment data. Since pg_waldump never requests + * the same WAL bytes twice, moving to a new segment implies the previous + * buffer's data and that segment will not be needed again. + */ + curSegNo = state->seg.ws_segno; + if (!XLByteInSeg(targetPagePtr, curSegNo, WalSegSz)) + { + char fname[MAXFNAMELEN]; + XLogSegNo nextSegNo; + + /* + * Calculate the next WAL segment to be decoded from the given page + * pointer + */ + XLByteToSeg(targetPagePtr, nextSegNo, WalSegSz); + state->seg.ws_tli = private->timeline; + state->seg.ws_segno = nextSegNo; + + /* + * If in pre-reading mode (prior to actual decoding), do not delete any + * entries that might be requested again once the decoding loop starts. + * For more details, see the comments in read_archive_wal_page(). + */ + if (private->decoding_started && curSegNo < nextSegNo) + { + XLogFileName(fname, state->seg.ws_tli, curSegNo, WalSegSz); + free_archive_wal_entry(fname, private); + } + } + + /* Read the WAL page from the archive streamer */ + return read_archive_wal_page(private, targetPagePtr, count, readBuff, + WalSegSz); +} + /* * Boolean to return whether the given WAL record matches a specific relation * and optionally block. @@ -777,8 +851,8 @@ usage(void) printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" " valid names are main, fsm, vm, init\n")); printf(_(" -n, --limit=N number of records to display\n")); - printf(_(" -p, --path=PATH directory in which to find WAL segment files or a\n" - " directory with a ./pg_wal that contains such files\n" + printf(_(" -p, --path=PATH tar archive or a directory in which to find WAL segment files or\n" + " a directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); printf(_(" -q, --quiet do not print any output, except for errors\n")); printf(_(" -r, --rmgr=RMGR only show records generated by resource manager RMGR;\n" @@ -810,7 +884,9 @@ main(int argc, char **argv) XLogRecord *record; XLogRecPtr first_record; char *waldir = NULL; + char *walpath = NULL; char *errormsg; + pg_compress_algorithm compression; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, @@ -868,6 +944,10 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.decoding_started = false; + private.archive_name = NULL; + private.start_segno = 0; + private.end_segno = UINT64_MAX; config.quiet = false; config.bkp_details = false; @@ -943,7 +1023,7 @@ main(int argc, char **argv) } break; case 'p': - waldir = pg_strdup(optarg); + walpath = pg_strdup(optarg); break; case 'q': config.quiet = true; @@ -1107,10 +1187,19 @@ main(int argc, char **argv) goto bad_argument; } - if (waldir != NULL) + if (walpath != NULL) { + /* validate path points to tar archive */ + if (parse_tar_compress_algorithm(walpath, &compression)) + { + char *fname = NULL; + + split_path(walpath, &waldir, &fname); + + private.archive_name = fname; + } /* validate path points to directory */ - if (!verify_directory(waldir)) + else if (!verify_directory(walpath)) { pg_log_error("could not open directory \"%s\": %m", waldir); goto bad_argument; @@ -1128,6 +1217,17 @@ main(int argc, char **argv) int fd; XLogSegNo segno; + /* + * If a tar archive is passed using the --path option, all other + * arguments become unnecessary. + */ + if (private.archive_name) + { + pg_log_error("unnecessary command-line arguments specified with tar archive (first is \"%s\")", + argv[optind]); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1138,69 +1238,76 @@ main(int argc, char **argv) pg_fatal("could not open directory \"%s\": %m", waldir); } - waldir = identify_target_directory(waldir, fname, &private.segsize); - fd = open_file_in_directory(waldir, fname); - if (fd < 0) - pg_fatal("could not open file \"%s\"", fname); - close(fd); - - /* parse position from file */ - XLogFromFileName(fname, &private.timeline, &segno, private.segsize); - - if (!XLogRecPtrIsValid(private.startptr)) - XLogSegNoOffsetToRecPtr(segno, 0, private.segsize, private.startptr); - else if (!XLByteInSeg(private.startptr, segno, private.segsize)) + if (fname != NULL && parse_tar_compress_algorithm(fname, &compression)) { - pg_log_error("start WAL location %X/%08X is not inside file \"%s\"", - LSN_FORMAT_ARGS(private.startptr), - fname); - goto bad_argument; + private.archive_name = fname; } - - /* no second file specified, set end position */ - if (!(optind + 1 < argc) && !XLogRecPtrIsValid(private.endptr)) - XLogSegNoOffsetToRecPtr(segno + 1, 0, private.segsize, private.endptr); - - /* parse ENDSEG if passed */ - if (optind + 1 < argc) + else { - XLogSegNo endsegno; - - /* ignore directory, already have that */ - split_path(argv[optind + 1], &directory, &fname); - + waldir = identify_target_directory(waldir, fname, &private.segsize); fd = open_file_in_directory(waldir, fname); if (fd < 0) pg_fatal("could not open file \"%s\"", fname); close(fd); /* parse position from file */ - XLogFromFileName(fname, &private.timeline, &endsegno, private.segsize); + XLogFromFileName(fname, &private.timeline, &segno, private.segsize); - if (endsegno < segno) - pg_fatal("ENDSEG %s is before STARTSEG %s", - argv[optind + 1], argv[optind]); + if (!XLogRecPtrIsValid(private.startptr)) + XLogSegNoOffsetToRecPtr(segno, 0, private.segsize, private.startptr); + else if (!XLByteInSeg(private.startptr, segno, private.segsize)) + { + pg_log_error("start WAL location %X/%08X is not inside file \"%s\"", + LSN_FORMAT_ARGS(private.startptr), + fname); + goto bad_argument; + } - if (!XLogRecPtrIsValid(private.endptr)) - XLogSegNoOffsetToRecPtr(endsegno + 1, 0, private.segsize, - private.endptr); + /* no second file specified, set end position */ + if (!(optind + 1 < argc) && !XLogRecPtrIsValid(private.endptr)) + XLogSegNoOffsetToRecPtr(segno + 1, 0, private.segsize, private.endptr); - /* set segno to endsegno for check of --end */ - segno = endsegno; - } + /* parse ENDSEG if passed */ + if (optind + 1 < argc) + { + XLogSegNo endsegno; + /* ignore directory, already have that */ + split_path(argv[optind + 1], &directory, &fname); - if (!XLByteInSeg(private.endptr, segno, private.segsize) && - private.endptr != (segno + 1) * private.segsize) - { - pg_log_error("end WAL location %X/%08X is not inside file \"%s\"", - LSN_FORMAT_ARGS(private.endptr), - argv[argc - 1]); - goto bad_argument; + fd = open_file_in_directory(waldir, fname); + if (fd < 0) + pg_fatal("could not open file \"%s\"", fname); + close(fd); + + /* parse position from file */ + XLogFromFileName(fname, &private.timeline, &endsegno, private.segsize); + + if (endsegno < segno) + pg_fatal("ENDSEG %s is before STARTSEG %s", + argv[optind + 1], argv[optind]); + + if (!XLogRecPtrIsValid(private.endptr)) + XLogSegNoOffsetToRecPtr(endsegno + 1, 0, private.segsize, + private.endptr); + + /* set segno to endsegno for check of --end */ + segno = endsegno; + } + + + if (!XLByteInSeg(private.endptr, segno, private.segsize) && + private.endptr != (segno + 1) * private.segsize) + { + pg_log_error("end WAL location %X/%08X is not inside file \"%s\"", + LSN_FORMAT_ARGS(private.endptr), + argv[argc - 1]); + goto bad_argument; + } } } - else - waldir = identify_target_directory(waldir, NULL, &private.segsize); + else if (!private.archive_name) + waldir = identify_target_directory(walpath, NULL, &private.segsize); /* we don't know what to print */ if (!XLogRecPtrIsValid(private.startptr)) @@ -1212,12 +1319,36 @@ main(int argc, char **argv) /* done with argument parsing, do the actual work */ /* we have everything we need, start reading */ - xlogreader_state = - XLogReaderAllocate(private.segsize, waldir, - XL_ROUTINE(.page_read = WALDumpReadPage, - .segment_open = WALDumpOpenSegment, - .segment_close = WALDumpCloseSegment), - &private); + if (private.archive_name) + { + /* + * A NULL WAL directory indicates that the archive file is located in + * the current working directory of the pg_waldump execution + */ + if (waldir == NULL) + waldir = pg_strdup("."); + + /* Set up for reading tar file */ + init_archive_reader(&private, waldir, &private.segsize, compression); + + /* Routine to decode WAL files in tar archive */ + xlogreader_state = + XLogReaderAllocate(private.segsize, waldir, + XL_ROUTINE(.page_read = TarWALDumpReadPage, + .segment_open = TarWALDumpOpenSegment, + .segment_close = TarWALDumpCloseSegment), + &private); + } + else + { + xlogreader_state = + XLogReaderAllocate(private.segsize, waldir, + XL_ROUTINE(.page_read = WALDumpReadPage, + .segment_open = WALDumpOpenSegment, + .segment_close = WALDumpCloseSegment), + &private); + } + if (!xlogreader_state) pg_fatal("out of memory while allocating a WAL reading processor"); @@ -1245,6 +1376,9 @@ main(int argc, char **argv) if (config.stats == true && !config.quiet) stats.startptr = first_record; + /* Flag indicating that the decoding loop has been entered */ + private.decoding_started = true; + for (;;) { if (time_to_stop) @@ -1326,6 +1460,9 @@ main(int argc, char **argv) XLogReaderFree(xlogreader_state); + if (private.archive_name) + free_archive_reader(&private); + return EXIT_SUCCESS; bad_argument: diff --git a/src/bin/pg_waldump/pg_waldump.h b/src/bin/pg_waldump/pg_waldump.h index 013b051506f..54d54a8a718 100644 --- a/src/bin/pg_waldump/pg_waldump.h +++ b/src/bin/pg_waldump/pg_waldump.h @@ -12,6 +12,11 @@ #define PG_WALDUMP_H #include "access/xlogdefs.h" +#include "fe_utils/astreamer.h" + +/* Forward declaration */ +struct ArchivedWALFile; +struct ArchivedWAL_hash; /* Contains the necessary information to drive WAL decoding */ typedef struct XLogDumpPrivate @@ -21,6 +26,44 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + bool decoding_started; + + /* Fields required to read WAL from archive */ + char *archive_name; /* Tar archive name */ + int archive_fd; /* File descriptor for the open tar file */ + + astreamer *archive_streamer; + + /* What the archive streamer is currently reading */ + struct ArchivedWALFile *cur_file; + + /* + * Hash table of all WAL files that the archive stream has read, including + * the one currently in progress. + */ + struct ArchivedWAL_hash *archive_wal_htab; + + /* + * Although these values can be easily derived from startptr and endptr, + * doing so repeatedly for each archived member would be inefficient, as + * it would involve recalculating and filtering out irrelevant WAL + * segments. + */ + XLogSegNo start_segno; + XLogSegNo end_segno; } XLogDumpPrivate; +extern int open_file_in_directory(const char *directory, const char *fname); + +extern void init_archive_reader(XLogDumpPrivate *privateInfo, + const char *waldir, int *WalSegSz, + pg_compress_algorithm compression); +extern void free_archive_reader(XLogDumpPrivate *privateInfo); +extern int read_archive_wal_page(XLogDumpPrivate *privateInfo, + XLogRecPtr targetPagePtr, + Size count, char *readBuff, + int WalSegSz); +extern void free_archive_wal_entry(const char *fname, + XLogDumpPrivate *privateInfo); + #endif /* PG_WALDUMP_H */ diff --git a/src/bin/pg_waldump/t/001_basic.pl b/src/bin/pg_waldump/t/001_basic.pl index f12ba52cbfc..9ab7457e9e2 100644 --- a/src/bin/pg_waldump/t/001_basic.pl +++ b/src/bin/pg_waldump/t/001_basic.pl @@ -3,10 +3,13 @@ use strict; use warnings FATAL => 'all'; +use Cwd; use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; use Test::More; +my $tar = $ENV{TAR}; + program_help_ok('pg_waldump'); program_version_ok('pg_waldump'); program_options_handling_ok('pg_waldump'); @@ -162,6 +165,42 @@ CREATE TABLESPACE ts1 LOCATION '$tblspc_path'; DROP TABLESPACE ts1; }); +# Test: Decode a continuation record (contrecord) that spans multiple WAL +# segments. +# +# Now consume all remaining room in the current WAL segment, leaving +# space enough only for the start of a largish record. +$node->safe_psql( + 'postgres', q{ +DO $$ +DECLARE + wal_segsize int := setting::int FROM pg_settings WHERE name = 'wal_segment_size'; + remain int; + iters int := 0; +BEGIN + LOOP + INSERT into t1(b) + select repeat(encode(sha256(g::text::bytea), 'hex'), (random() * 15 + 1)::int) + from generate_series(1, 10) g; + + remain := wal_segsize - (pg_current_wal_insert_lsn() - '0/0') % wal_segsize; + IF remain < 2 * setting::int from pg_settings where name = 'block_size' THEN + RAISE log 'exiting after % iterations, % bytes to end of WAL segment', iters, remain; + EXIT; + END IF; + iters := iters + 1; + END LOOP; +END +$$; +}); + +my $contrecord_lsn = $node->safe_psql('postgres', + 'SELECT pg_current_wal_insert_lsn()'); +# Generate contrecord record +$node->safe_psql('postgres', + qq{SELECT pg_logical_emit_message(true, 'test 026', repeat('xyzxz', 123456))} +); + my ($end_lsn, $end_walfile) = split /\|/, $node->safe_psql('postgres', q{SELECT pg_current_wal_insert_lsn(), pg_walfile_name(pg_current_wal_insert_lsn())} @@ -259,11 +298,50 @@ sub test_pg_waldump return @lines; } -my @lines; +# Create a tar archive, sorting the file order +sub generate_archive +{ + my ($archive, $directory, $compression_flags) = @_; + + my @files; + opendir my $dh, $directory or die "opendir: $!"; + while (my $entry = readdir $dh) { + # Skip '.' and '..' + next if $entry eq '.' || $entry eq '..'; + push @files, $entry; + } + closedir $dh; + + @files = sort @files; + + # move into the WAL directory before archiving files + my $cwd = getcwd; + chdir($directory) || die "chdir: $!"; + command_ok([$tar, $compression_flags, $archive, @files]); + chdir($cwd) || die "chdir: $!"; +} + +my $tmp_dir = PostgreSQL::Test::Utils::tempdir_short(); my @scenarios = ( { - 'path' => $node->data_dir + 'path' => $node->data_dir, + 'is_archive' => 0, + 'enabled' => 1 + }, + { + 'path' => "$tmp_dir/pg_wal.tar", + 'compression_method' => 'none', + 'compression_flags' => '-cf', + 'is_archive' => 1, + 'enabled' => 1 + }, + { + 'path' => "$tmp_dir/pg_wal.tar.gz", + 'compression_method' => 'gzip', + 'compression_flags' => '-czf', + 'is_archive' => 1, + 'enabled' => check_pg_config("#define HAVE_LIBZ 1") }); for my $scenario (@scenarios) @@ -272,6 +350,19 @@ for my $scenario (@scenarios) SKIP: { + skip "tar command is not available", 3 + if !defined $tar; + skip "$scenario->{'compression_method'} compression not supported by this build", 3 + if !$scenario->{'enabled'} && $scenario->{'is_archive'}; + + # create pg_wal archive + if ($scenario->{'is_archive'}) + { + generate_archive($path, + $node->data_dir . '/pg_wal', + $scenario->{'compression_flags'}); + } + command_fails_like( [ 'pg_waldump', '--path' => $path ], qr/error: no start WAL location given/, @@ -305,9 +396,14 @@ for my $scenario (@scenarios) test_pg_waldump_skip_bytes($path, $start_lsn, $end_lsn); - @lines = test_pg_waldump($path, $start_lsn, $end_lsn); + my @lines = test_pg_waldump($path, $start_lsn, $end_lsn); is(grep(!/^rmgr: \w/, @lines), 0, 'all output lines are rmgr lines'); + @lines = test_pg_waldump($path, $contrecord_lsn, $end_lsn); + is(grep(!/^rmgr: \w/, @lines), 0, 'all output lines are rmgr lines'); + + test_pg_waldump_skip_bytes($path, $contrecord_lsn, $end_lsn); + @lines = test_pg_waldump($path, $start_lsn, $end_lsn, '--limit' => 6); is(@lines, 6, 'limit option observed'); @@ -337,6 +433,9 @@ for my $scenario (@scenarios) '--relation' => "$default_ts_oid/$postgres_db_oid/$rel_i1a_oid", '--block' => 1); is(grep(!/\bblk 1\b/, @lines), 0, 'only lines for selected block'); + + # Cleanup. + unlink $path if $scenario->{'is_archive'}; } } diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 77e3c04144e..595ad7d5c5a 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -145,6 +145,8 @@ ArchiveOpts ArchiveShutdownCB ArchiveStartupCB ArchiveStreamState +ArchivedWALFile +ArchivedWAL_hash ArchiverOutput ArchiverStage ArrayAnalyzeExtraData @@ -3513,6 +3515,7 @@ astreamer_recovery_injector astreamer_tar_archiver astreamer_tar_parser astreamer_verify +astreamer_waldump astreamer_zstd_frame auth_password_hook_typ autovac_table -- 2.47.1