public inbox for [email protected]
help / color / mirror / Atom feed
From: Maxim Orlov <[email protected]>
To: wenhui qiu <[email protected]>
Cc: Heikki Linnakangas <[email protected]>
Cc: Postgres hackers <[email protected]>
Subject: Re: POC: make mxidoff 64 bits
Date: Wed, 14 Aug 2024 18:30:15 +0300
Message-ID: <CACG=ezY9xq73jcX_EjVqx5-f90nbQ9PyhFCTW2fwFCS2wmNiFw@mail.gmail.com> (raw)
In-Reply-To: <CACG=ezYokoiumOFnqUfg_ffHD5s8T+6iHYfzKLfa=QQ-1pNrBg@mail.gmail.com>
References: <CACG=ezaWg7_nt-8ey4aKv2w9LcuLthHknwCawmBgEeTnJrJTcw@mail.gmail.com>
<[email protected]>
<CAGjGUAKO1GCzG5wBMt5RosWo0PatgFpYY=Gjgt77tN2brNe=Bg@mail.gmail.com>
<CACG=ezYokoiumOFnqUfg_ffHD5s8T+6iHYfzKLfa=QQ-1pNrBg@mail.gmail.com>
Hi!
Sorry for the delay. I was a bit busy last month. Anyway, here is my
proposal for making multioffsets 64 bit.
The patch set consists of three parts:
0001 - making user output of offsets 64-bit ready;
0002 - making offsets 64-bit;
0003 - provide 32 to 64 bit conversion in pg_upgrade.
I'm pretty sure this is just the beginning of the conversation, so any
opinions and reviews, as always, are very welcome!
--
Best regards,
Maxim Orlov.
Attachments:
[application/x-patch] v1-0002-Use-64-bit-multixact-offsets.patch (14.3K, 2-v1-0002-Use-64-bit-multixact-offsets.patch)
download | inline diff:
From 2e1f05b3b0504153e57188e968bb19cb6741c087 Mon Sep 17 00:00:00 2001
From: Maxim Orlov <[email protected]>
Date: Wed, 6 Mar 2024 11:11:33 +0300
Subject: [PATCH v1 2/3] Use 64-bit multixact offsets.
Author: Maxim Orlov <[email protected]>
---
src/backend/access/transam/multixact.c | 182 ++-----------------------
src/bin/pg_resetwal/pg_resetwal.c | 2 +-
src/bin/pg_resetwal/t/001_basic.pl | 2 +-
src/include/access/multixact.h | 2 +-
src/include/c.h | 2 +-
5 files changed, 16 insertions(+), 174 deletions(-)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 57c5148933..f2a2aa9547 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -95,14 +95,6 @@
/*
* Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
* used everywhere else in Postgres.
- *
- * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
- * MultiXact page numbering also wraps around at
- * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
- * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
- * take no explicit notice of that fact in this module, except when comparing
- * segment and page numbers in TruncateMultiXact (see
- * MultiXactOffsetPagePrecedes).
*/
/* We need four bytes per offset */
@@ -174,7 +166,7 @@ MXOffsetToMemberPage(MultiXactOffset offset)
return offset / MULTIXACT_MEMBERS_PER_PAGE;
}
-static inline int
+static inline int64
MXOffsetToMemberSegment(MultiXactOffset offset)
{
return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT;
@@ -271,9 +263,6 @@ typedef struct MultiXactStateData
MultiXactId multiStopLimit;
MultiXactId multiWrapLimit;
- /* support for members anti-wraparound measures */
- MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
-
/*
* This is used to sleep until a multixact offset is written when we want
* to create the next one.
@@ -408,8 +397,6 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
MultiXactOffset offset2);
static void ExtendMultiXactOffset(MultiXactId multi);
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
-static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
- MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
@@ -1158,78 +1145,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
else
*offset = nextOffset;
- /*----------
- * Protect against overrun of the members space as well, with the
- * following rules:
- *
- * If we're past offsetStopLimit, refuse to generate more multis.
- * If we're close to offsetStopLimit, emit a warning.
- *
- * Arbitrarily, we start emitting warnings when we're 20 segments or less
- * from offsetStopLimit.
- *
- * Note we haven't updated the shared state yet, so if we fail at this
- * point, the multixact ID we grabbed can still be used by the next guy.
- *
- * Note that there is no point in forcing autovacuum runs here: the
- * multixact freeze settings would have to be reduced for that to have any
- * effect.
- *----------
- */
-#define OFFSET_WARN_SEGMENTS 20
- if (MultiXactState->oldestOffsetKnown &&
- MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
- nmembers))
- {
- /* see comment in the corresponding offsets wraparound case */
- SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
-
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("multixact \"members\" limit exceeded"),
- errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
- "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
- MultiXactState->offsetStopLimit - nextOffset - 1,
- nmembers,
- MultiXactState->offsetStopLimit - nextOffset - 1),
- errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
- MultiXactState->oldestMultiXactDB)));
- }
-
- /*
- * Check whether we should kick autovacuum into action, to prevent members
- * wraparound. NB we use a much larger window to trigger autovacuum than
- * just the warning limit. The warning is just a measure of last resort -
- * this is in line with GetNewTransactionId's behaviour.
- */
- if (!MultiXactState->oldestOffsetKnown ||
- (MultiXactState->nextOffset - MultiXactState->oldestOffset
- > MULTIXACT_MEMBER_SAFE_THRESHOLD))
- {
- /*
- * To avoid swamping the postmaster with signals, we issue the autovac
- * request only when crossing a segment boundary. With default
- * compilation settings that's roughly after 50k members. This still
- * gives plenty of chances before we get into real trouble.
- */
- if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
- (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
- SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
- }
-
- if (MultiXactState->oldestOffsetKnown &&
- MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
- nextOffset,
- nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
- ereport(WARNING,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
- "database with OID %u must be vacuumed before %d more multixact members are used",
- MultiXactState->offsetStopLimit - nextOffset + nmembers,
- MultiXactState->oldestMultiXactDB,
- MultiXactState->offsetStopLimit - nextOffset + nmembers),
- errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
-
ExtendMultiXactMember(nextOffset, nmembers);
/*
@@ -1968,7 +1883,7 @@ MultiXactShmemInit(void)
"pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
LWTRANCHE_MULTIXACTOFFSET_SLRU,
SYNC_HANDLER_MULTIXACT_OFFSET,
- false);
+ true);
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
SimpleLruInit(MultiXactMemberCtl,
"multixact_member", multixact_member_buffers, 0,
@@ -2713,8 +2628,6 @@ SetOffsetVacuumLimit(bool is_startup)
MultiXactOffset nextOffset;
bool oldestOffsetKnown = false;
bool prevOldestOffsetKnown;
- MultiXactOffset offsetStopLimit = 0;
- MultiXactOffset prevOffsetStopLimit;
/*
* NB: Have to prevent concurrent truncation, we might otherwise try to
@@ -2729,7 +2642,6 @@ SetOffsetVacuumLimit(bool is_startup)
nextOffset = MultiXactState->nextOffset;
prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
prevOldestOffset = MultiXactState->oldestOffset;
- prevOffsetStopLimit = MultiXactState->offsetStopLimit;
Assert(MultiXactState->finishedStartup);
LWLockRelease(MultiXactGenLock);
@@ -2760,11 +2672,7 @@ SetOffsetVacuumLimit(bool is_startup)
oldestOffsetKnown =
find_multixact_start(oldestMultiXactId, &oldestOffset);
- if (oldestOffsetKnown)
- ereport(DEBUG1,
- (errmsg_internal("oldest MultiXactId member is at offset %u",
- oldestOffset)));
- else
+ if (!oldestOffsetKnown)
ereport(LOG,
(errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
oldestMultiXactId)));
@@ -2777,24 +2685,7 @@ SetOffsetVacuumLimit(bool is_startup)
* overrun of old data in the members SLRU area. We can only do so if the
* oldest offset is known though.
*/
- if (oldestOffsetKnown)
- {
- /* move back to start of the corresponding segment */
- offsetStopLimit = oldestOffset - (oldestOffset %
- (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
-
- /* always leave one segment before the wraparound point */
- offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
-
- if (!prevOldestOffsetKnown && !is_startup)
- ereport(LOG,
- (errmsg("MultiXact member wraparound protections are now enabled")));
-
- ereport(DEBUG1,
- (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
- offsetStopLimit, oldestMultiXactId)));
- }
- else if (prevOldestOffsetKnown)
+ if (prevOldestOffsetKnown)
{
/*
* If we failed to get the oldest offset this time, but we have a
@@ -2804,14 +2695,12 @@ SetOffsetVacuumLimit(bool is_startup)
*/
oldestOffset = prevOldestOffset;
oldestOffsetKnown = true;
- offsetStopLimit = prevOffsetStopLimit;
}
/* Install the computed values */
LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
MultiXactState->oldestOffset = oldestOffset;
MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
- MultiXactState->offsetStopLimit = offsetStopLimit;
LWLockRelease(MultiXactGenLock);
/*
@@ -2821,54 +2710,6 @@ SetOffsetVacuumLimit(bool is_startup)
(nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
}
-/*
- * Return whether adding "distance" to "start" would move past "boundary".
- *
- * We use this to determine whether the addition is "wrapping around" the
- * boundary point, hence the name. The reason we don't want to use the regular
- * 2^31-modulo arithmetic here is that we want to be able to use the whole of
- * the 2^32-1 space here, allowing for more multixacts than would fit
- * otherwise.
- */
-static bool
-MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
- uint32 distance)
-{
- MultiXactOffset finish;
-
- /*
- * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
- * if the addition wraps around the UINT_MAX boundary, skip that value.
- */
- finish = start + distance;
- if (finish < start)
- finish++;
-
- /*-----------------------------------------------------------------------
- * When the boundary is numerically greater than the starting point, any
- * value numerically between the two is not wrapped:
- *
- * <----S----B---->
- * [---) = F wrapped past B (and UINT_MAX)
- * [---) = F not wrapped
- * [----] = F wrapped past B
- *
- * When the boundary is numerically less than the starting point (i.e. the
- * UINT_MAX wraparound occurs somewhere in between) then all values in
- * between are wrapped:
- *
- * <----B----S---->
- * [---) = F not wrapped past B (but wrapped past UINT_MAX)
- * [---) = F wrapped past B (and UINT_MAX)
- * [----] = F not wrapped
- *-----------------------------------------------------------------------
- */
- if (start < boundary)
- return finish >= boundary || finish < start;
- else
- return finish >= boundary && finish < start;
-}
-
/*
* Find the starting offset of the given MultiXactId.
*
@@ -2990,8 +2831,9 @@ MultiXactMemberFreezeThreshold(void)
* we try to eliminate from the system is based on how far we are past
* MULTIXACT_MEMBER_SAFE_THRESHOLD.
*/
- fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
- (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
+ fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD);
+ fraction /= (double) (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
+
victim_multixacts = multixacts * fraction;
/* fraction could be > 1.0, but lowest possible freeze age is zero */
@@ -3041,10 +2883,10 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data
static void
PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
{
- const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
- int startsegment = MXOffsetToMemberSegment(oldestOffset);
- int endsegment = MXOffsetToMemberSegment(newOldestOffset);
- int segment = startsegment;
+ const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
+ int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
+ int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
+ int64 segment = startsegment;
/*
* Delete all the segments but the last one. The last segment can still
@@ -3337,7 +3179,7 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
static bool
MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
{
- int32 diff = (int32) (offset1 - offset2);
+ int64 diff = (int64) (offset1 - offset2);
return (diff < 0);
}
diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c
index 985cd06802..1af2ce4b93 100644
--- a/src/bin/pg_resetwal/pg_resetwal.c
+++ b/src/bin/pg_resetwal/pg_resetwal.c
@@ -264,7 +264,7 @@ main(int argc, char *argv[])
case 'O':
errno = 0;
- set_mxoff = strtoul(optarg, &endptr, 0);
+ set_mxoff = strtou64(optarg, &endptr, 0);
if (endptr == optarg || *endptr != '\0' || errno != 0)
{
pg_log_error("invalid argument for option %s", "-O");
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 9829e48106..f8a8eef44d 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * $blcksz / 8;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 7ffd256c74..90583634ec 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -27,7 +27,7 @@
#define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId)
-#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF)
+#define MaxMultiXactOffset UINT64CONST(0xFFFFFFFFFFFFFFFF)
/*
* Possible multixact lock modes ("status"). The first four modes are for
diff --git a/src/include/c.h b/src/include/c.h
index dc1841346c..ccfb82b478 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -661,7 +661,7 @@ typedef uint32 SubTransactionId;
/* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */
typedef TransactionId MultiXactId;
-typedef uint32 MultiXactOffset;
+typedef uint64 MultiXactOffset;
typedef uint32 CommandId;
--
2.45.2
[application/x-patch] v1-0001-Use-64-bit-format-output-for-multixact-offsets.patch (9.0K, 3-v1-0001-Use-64-bit-format-output-for-multixact-offsets.patch)
download | inline diff:
From 95226756a225ca6b95e2baafff502034c355310d Mon Sep 17 00:00:00 2001
From: Maxim Orlov <[email protected]>
Date: Wed, 7 Aug 2024 16:35:22 +0300
Subject: [PATCH v1 1/3] Use 64-bit format output for multixact offsets
Author: Maxim Orlov <[email protected]>
---
src/backend/access/rmgrdesc/mxactdesc.c | 9 ++++----
src/backend/access/rmgrdesc/xlogdesc.c | 4 ++--
src/backend/access/transam/multixact.c | 26 +++++++++++++----------
src/backend/access/transam/xlogrecovery.c | 5 +++--
src/bin/pg_controldata/pg_controldata.c | 4 ++--
src/bin/pg_resetwal/pg_resetwal.c | 8 +++----
6 files changed, 31 insertions(+), 25 deletions(-)
diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c
index 3e8ad4d5ef..1b486de38c 100644
--- a/src/backend/access/rmgrdesc/mxactdesc.c
+++ b/src/backend/access/rmgrdesc/mxactdesc.c
@@ -65,8 +65,8 @@ multixact_desc(StringInfo buf, XLogReaderState *record)
xl_multixact_create *xlrec = (xl_multixact_create *) rec;
int i;
- appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid,
- xlrec->moff, xlrec->nmembers);
+ appendStringInfo(buf, "%u offset %llu nmembers %d: ", xlrec->mid,
+ (unsigned long long) xlrec->moff, xlrec->nmembers);
for (i = 0; i < xlrec->nmembers; i++)
out_member(buf, &xlrec->members[i]);
}
@@ -74,9 +74,10 @@ multixact_desc(StringInfo buf, XLogReaderState *record)
{
xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec;
- appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)",
+ appendStringInfo(buf, "offsets [%u, %u), members [%llu, %llu)",
xlrec->startTruncOff, xlrec->endTruncOff,
- xlrec->startTruncMemb, xlrec->endTruncMemb);
+ (unsigned long long) xlrec->startTruncMemb,
+ (unsigned long long) xlrec->endTruncMemb);
}
}
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 363294d623..aaa19c81c8 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -66,7 +66,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
CheckPoint *checkpoint = (CheckPoint *) rec;
appendStringInfo(buf, "redo %X/%X; "
- "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; "
+ "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %llu; "
"oldest xid %u in DB %u; oldest multi %u in DB %u; "
"oldest/newest commit timestamp xid: %u/%u; "
"oldest running xid %u; %s",
@@ -79,7 +79,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
XidFromFullTransactionId(checkpoint->nextXid),
checkpoint->nextOid,
checkpoint->nextMulti,
- checkpoint->nextMultiOffset,
+ (unsigned long long) checkpoint->nextMultiOffset,
checkpoint->oldestXid,
checkpoint->oldestXidDB,
checkpoint->oldestMulti,
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index c601ff98a1..57c5148933 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1258,7 +1258,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
LWLockRelease(MultiXactGenLock);
- debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
+ debug_elog4(DEBUG2, "GetNew: returning %u offset %llu", result,
+ (unsigned long long) *offset);
return result;
}
@@ -2285,8 +2286,9 @@ MultiXactGetCheckptMulti(bool is_shutdown,
LWLockRelease(MultiXactGenLock);
debug_elog6(DEBUG2,
- "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
- *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
+ "MultiXact: checkpoint is nextMulti %u, nextOffset %llu, oldestMulti %u in DB %u",
+ *nextMulti, (unsigned long long) *nextMultiOffset, *oldestMulti,
+ *oldestMultiDB);
}
/*
@@ -2320,8 +2322,8 @@ void
MultiXactSetNextMXact(MultiXactId nextMulti,
MultiXactOffset nextMultiOffset)
{
- debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
- nextMulti, nextMultiOffset);
+ debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %llu",
+ nextMulti, (unsigned long long) nextMultiOffset);
LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
MultiXactState->nextMXact = nextMulti;
MultiXactState->nextOffset = nextMultiOffset;
@@ -2511,8 +2513,8 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti,
}
if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
{
- debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
- minMultiOffset);
+ debug_elog3(DEBUG2, "MultiXact: setting next offset to %llu",
+ (unsigned long long) minMultiOffset);
MultiXactState->nextOffset = minMultiOffset;
}
LWLockRelease(MultiXactGenLock);
@@ -3203,11 +3205,12 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
elog(DEBUG1, "performing multixact truncation: "
"offsets [%u, %u), offsets segments [%llx, %llx), "
- "members [%u, %u), members segments [%llx, %llx)",
+ "members [%llu, %llu), members segments [%llx, %llx)",
oldestMulti, newOldestMulti,
(unsigned long long) MultiXactIdToOffsetSegment(oldestMulti),
(unsigned long long) MultiXactIdToOffsetSegment(newOldestMulti),
- oldestOffset, newOldestOffset,
+ (unsigned long long) oldestOffset,
+ (unsigned long long) newOldestOffset,
(unsigned long long) MXOffsetToMemberSegment(oldestOffset),
(unsigned long long) MXOffsetToMemberSegment(newOldestOffset));
@@ -3463,11 +3466,12 @@ multixact_redo(XLogReaderState *record)
elog(DEBUG1, "replaying multixact truncation: "
"offsets [%u, %u), offsets segments [%llx, %llx), "
- "members [%u, %u), members segments [%llx, %llx)",
+ "members [%llu, %llu), members segments [%llx, %llx)",
xlrec.startTruncOff, xlrec.endTruncOff,
(unsigned long long) MultiXactIdToOffsetSegment(xlrec.startTruncOff),
(unsigned long long) MultiXactIdToOffsetSegment(xlrec.endTruncOff),
- xlrec.startTruncMemb, xlrec.endTruncMemb,
+ (unsigned long long) xlrec.startTruncMemb,
+ (unsigned long long) xlrec.endTruncMemb,
(unsigned long long) MXOffsetToMemberSegment(xlrec.startTruncMemb),
(unsigned long long) MXOffsetToMemberSegment(xlrec.endTruncMemb));
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index ad817fbca6..388037a94b 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -877,8 +877,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
U64FromFullTransactionId(checkPoint.nextXid),
checkPoint.nextOid)));
ereport(DEBUG1,
- (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
- checkPoint.nextMulti, checkPoint.nextMultiOffset)));
+ (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %llu",
+ checkPoint.nextMulti,
+ (unsigned long long) checkPoint.nextMultiOffset)));
ereport(DEBUG1,
(errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
checkPoint.oldestXid, checkPoint.oldestXidDB)));
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 93a05d80ca..43b6727570 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -253,8 +253,8 @@ main(int argc, char *argv[])
ControlFile->checkPointCopy.nextOid);
printf(_("Latest checkpoint's NextMultiXactId: %u\n"),
ControlFile->checkPointCopy.nextMulti);
- printf(_("Latest checkpoint's NextMultiOffset: %u\n"),
- ControlFile->checkPointCopy.nextMultiOffset);
+ printf(_("Latest checkpoint's NextMultiOffset: %llu\n"),
+ (unsigned long long) ControlFile->checkPointCopy.nextMultiOffset);
printf(_("Latest checkpoint's oldestXID: %u\n"),
ControlFile->checkPointCopy.oldestXid);
printf(_("Latest checkpoint's oldestXID's DB: %u\n"),
diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c
index e9dcb5a6d8..985cd06802 100644
--- a/src/bin/pg_resetwal/pg_resetwal.c
+++ b/src/bin/pg_resetwal/pg_resetwal.c
@@ -737,8 +737,8 @@ PrintControlValues(bool guessed)
ControlFile.checkPointCopy.nextOid);
printf(_("Latest checkpoint's NextMultiXactId: %u\n"),
ControlFile.checkPointCopy.nextMulti);
- printf(_("Latest checkpoint's NextMultiOffset: %u\n"),
- ControlFile.checkPointCopy.nextMultiOffset);
+ printf(_("Latest checkpoint's NextMultiOffset: %llu\n"),
+ (unsigned long long) ControlFile.checkPointCopy.nextMultiOffset);
printf(_("Latest checkpoint's oldestXID: %u\n"),
ControlFile.checkPointCopy.oldestXid);
printf(_("Latest checkpoint's oldestXID's DB: %u\n"),
@@ -809,8 +809,8 @@ PrintNewControlValues(void)
if (set_mxoff != -1)
{
- printf(_("NextMultiOffset: %u\n"),
- ControlFile.checkPointCopy.nextMultiOffset);
+ printf(_("NextMultiOffset: %llu\n"),
+ (unsigned long long) ControlFile.checkPointCopy.nextMultiOffset);
}
if (set_oid != 0)
--
2.45.2
[application/x-patch] v1-0003-Make-pg_upgrade-convert-multixact-offsets.patch (12.9K, 4-v1-0003-Make-pg_upgrade-convert-multixact-offsets.patch)
download | inline diff:
From 063ec2662d94f7a72e3162702c4051f34cd67000 Mon Sep 17 00:00:00 2001
From: Maxim Orlov <[email protected]>
Date: Tue, 13 Aug 2024 14:44:50 +0300
Subject: [PATCH v1 3/3] Make pg_upgrade convert multixact offsets.
Author: Maxim Orlov <[email protected]>
---
src/bin/pg_upgrade/Makefile | 1 +
src/bin/pg_upgrade/meson.build | 1 +
src/bin/pg_upgrade/pg_upgrade.c | 29 ++-
src/bin/pg_upgrade/pg_upgrade.h | 13 +-
src/bin/pg_upgrade/segresize.c | 350 +++++++++++++++++++++++++++++++
src/include/catalog/catversion.h | 2 +-
6 files changed, 391 insertions(+), 5 deletions(-)
create mode 100644 src/bin/pg_upgrade/segresize.c
diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index bde91e2beb..030816596f 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -21,6 +21,7 @@ OBJS = \
info.o \
option.o \
parallel.o \
+ segresize.o \
pg_upgrade.o \
relfilenumber.o \
server.o \
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index 9825fa3305..2d9f7e6b65 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -10,6 +10,7 @@ pg_upgrade_sources = files(
'info.c',
'option.c',
'parallel.c',
+ 'segresize.c',
'pg_upgrade.c',
'relfilenumber.c',
'server.c',
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 663235816f..d9d8d0ea78 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -750,7 +750,30 @@ copy_xact_xlog_xid(void)
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
{
- copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+ /*
+ * If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER
+ * it must have 32-bit multixid offsets, thus it should be converted.
+ */
+ if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER &&
+ new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
+ {
+ uint64 oldest_offset = convert_multixact_offsets();
+
+ if (oldest_offset)
+ {
+ uint64 next_offset = old_cluster.controldata.chkpnt_nxtmxoff;
+
+ /* Handle possible wraparound. */
+ if (next_offset < oldest_offset)
+ next_offset += ((uint64) 1 << 32) - 1;
+
+ next_offset -= oldest_offset - 1;
+ old_cluster.controldata.chkpnt_nxtmxoff = next_offset;
+ }
+ }
+ else
+ copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
prep_status("Setting next multixact ID and offset for new cluster");
@@ -760,9 +783,9 @@ copy_xact_xlog_xid(void)
* counters here and the oldest multi present on system.
*/
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
- "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"",
+ "\"%s/pg_resetwal\" -O %llu -m %u,%u \"%s\"",
new_cluster.bindir,
- old_cluster.controldata.chkpnt_nxtmxoff,
+ (unsigned long long) old_cluster.controldata.chkpnt_nxtmxoff,
old_cluster.controldata.chkpnt_nxtmulti,
old_cluster.controldata.chkpnt_oldstMulti,
new_cluster.pgdata);
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index cdb6e2b759..37d173cb86 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,13 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * Switch from 32-bit to 64-bit for multixid offsets.
+ *
+ * XXX: should be changed to the actual CATALOG_VERSION_NO on commit.
+ */
+#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 202408123
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -230,7 +237,7 @@ typedef struct
uint32 chkpnt_nxtepoch;
uint32 chkpnt_nxtoid;
uint32 chkpnt_nxtmulti;
- uint32 chkpnt_nxtmxoff;
+ uint64 chkpnt_nxtmxoff;
uint32 chkpnt_oldstMulti;
uint32 chkpnt_oldstxid;
uint32 align;
@@ -494,3 +501,7 @@ void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr
char *old_pgdata, char *new_pgdata,
char *old_tablespace);
bool reap_child(bool wait_for_child);
+
+/* segresize.c */
+
+uint64 convert_multixact_offsets(void);
diff --git a/src/bin/pg_upgrade/segresize.c b/src/bin/pg_upgrade/segresize.c
new file mode 100644
index 0000000000..e47c0a2407
--- /dev/null
+++ b/src/bin/pg_upgrade/segresize.c
@@ -0,0 +1,350 @@
+/*
+ * segresize.c
+ *
+ * SLRU segment resize utility
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/segresize.c
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_upgrade.h"
+#include "access/multixact.h"
+
+/* See slru.h */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+/*
+ * Some kind of iterator associated with a particular SLRU segment. The idea is
+ * to specify the segment and page number and then move through the pages.
+ */
+typedef struct SlruSegState
+{
+ char *dir;
+ char *fn;
+ FILE *file;
+ int64 segno;
+ uint64 pageno;
+ bool leading_gap;
+ bool long_segment_names;
+} SlruSegState;
+
+/*
+ * Get SLRU segment file name from state.
+ *
+ * NOTE: this function should mirror SlruFileName call.
+ */
+static inline char *
+SlruFileName(SlruSegState *state)
+{
+ if (state->long_segment_names)
+ {
+ Assert(state->segno >= 0 &&
+ state->segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+ return psprintf("%s/%015llX", state->dir, (long long) state->segno);
+ }
+ else
+ {
+ Assert(state->segno >= 0 &&
+ state->segno <= INT64CONST(0xFFFFFF));
+ return psprintf("%s/%04X", state->dir, (unsigned int) state->segno);
+ }
+}
+
+/*
+ * Create SLRU segment file.
+ */
+static void
+create_segment(SlruSegState *state)
+{
+ Assert(state->fn == NULL);
+ Assert(state->file == NULL);
+
+ state->fn = SlruFileName(state);
+ state->file = fopen(state->fn, "wb");
+ if (!state->file)
+ pg_fatal("could not create file \"%s\": %m", state->fn);
+}
+
+/*
+ * Open existing SLRU segment file.
+ */
+static void
+open_segment(SlruSegState *state)
+{
+ Assert(state->fn == NULL);
+ Assert(state->file == NULL);
+
+ state->fn = SlruFileName(state);
+ state->file = fopen(state->fn, "rb");
+ if (!state->file)
+ pg_fatal("could not open file \"%s\": %m", state->fn);
+}
+
+/*
+ * Close SLRU segment file.
+ */
+static void
+close_segment(SlruSegState *state)
+{
+ if (state->file)
+ {
+ fclose(state->file);
+ state->file = NULL;
+ }
+
+ if (state->fn)
+ {
+ pfree(state->fn);
+ state->fn = NULL;
+ }
+}
+
+/*
+ * Read next page from the old 32-bit offset segment file.
+ */
+static int
+read_old_segment_page(SlruSegState *state, void *buf, bool *empty)
+{
+ int len;
+
+ /* Open next segment file, if needed. */
+ if (!state->fn)
+ {
+ if (!state->segno)
+ state->leading_gap = true;
+
+ open_segment(state);
+
+ /* Set position to the needed page. */
+ if (state->pageno > 0 &&
+ fseek(state->file, state->pageno * BLCKSZ, SEEK_SET))
+ {
+ close_segment(state);
+ }
+ }
+
+ if (state->file)
+ {
+		/* Segment file does exist, read page from it. */
+ state->leading_gap = false;
+
+ len = fread(buf, sizeof(char), BLCKSZ, state->file);
+
+ /* Are we done or was there an error? */
+ if (len <= 0)
+ {
+ if (ferror(state->file))
+ pg_fatal("error reading file \"%s\": %m", state->fn);
+
+ if (feof(state->file))
+ {
+ *empty = true;
+ len = -1;
+
+ close_segment(state);
+ }
+ }
+ else
+ *empty = false;
+ }
+ else if (!state->leading_gap)
+ {
+ /* We reached the last segment. */
+ len = -1;
+ *empty = true;
+ }
+ else
+ {
+ /* Skip few first segments if they were frozen and removed. */
+ len = BLCKSZ;
+ *empty = true;
+ }
+
+ if (++state->pageno >= SLRU_PAGES_PER_SEGMENT)
+ {
+ /* Start a new segment. */
+ state->segno++;
+ state->pageno = 0;
+
+ close_segment(state);
+ }
+
+ return len;
+}
+
+/*
+ * Append one page to the new 64-bit offset segment file.
+ *
+ * The segment file is created lazily, on the first call that finds no open
+ * file.  When the state points at a page other than the first one at that
+ * moment, all preceding pages of the segment are written out as zeroes.
+ *
+ * After the write the page counter is advanced; once it reaches
+ * SLRU_PAGES_PER_SEGMENT the segment is closed and the state moves on to
+ * the first page of the next segment.  Write failures are fatal.
+ */
+static void
+write_new_segment_page(SlruSegState *state, void *buf)
+{
+	if (state->file == NULL)
+	{
+		create_segment(state);
+
+		/* Zero-fill the pages that were skipped before this one. */
+		if (state->pageno > 0)
+		{
+			char		padding[BLCKSZ] = {0};
+			int64		left = state->pageno;
+
+			while (left-- > 0)
+			{
+				if (fwrite(padding, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
+					pg_fatal("could not write file \"%s\": %m", state->fn);
+			}
+		}
+	}
+
+	/* Write the page itself, provided the segment file is open. */
+	if (state->file != NULL &&
+		fwrite(buf, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
+		pg_fatal("could not write file \"%s\": %m", state->fn);
+
+	state->pageno++;
+
+	/* Still room in this segment?  Then we are done. */
+	if (state->pageno < SLRU_PAGES_PER_SEGMENT)
+		return;
+
+	/* Segment is full: close it, the next call creates the following one. */
+	state->segno++;
+	state->pageno = 0;
+	close_segment(state);
+}
+
+/*
+ * Convert pg_multixact/offsets segments and return oldest multi offset.
+ *
+ * Reads the old cluster's offsets pages (32-bit entries) for every multi
+ * from the checkpoint's oldest multi up to, but not including, its next
+ * multi, and writes them to the new cluster's pg_multixact/offsets as
+ * MultiXactOffset entries.  Offsets are rebased so that the oldest multi
+ * gets offset 1.
+ *
+ * Returns the offset the oldest multi had in the old cluster, or 0 when
+ * there was no multi to convert.  Any read or write problem is fatal.
+ */
+uint64
+convert_multixact_offsets(void)
+{
+	/* See multixact.c */
+#define MULTIXACT_OFFSETS_PER_PAGE_OLD (BLCKSZ / sizeof(uint32))
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+
+	SlruSegState oldseg = {0},
+				newseg = {0};
+	uint32		oldbuf[MULTIXACT_OFFSETS_PER_PAGE_OLD] = {0};
+	MultiXactOffset newbuf[MULTIXACT_OFFSETS_PER_PAGE] = {0};
+
+	/*
+	 * It is much easier to deal with multi wraparound in 64-bit format. Thus
+	 * we use 64 bits for multi-transactions, although they remain 32 bits.
+	 */
+	uint64		oldest_multi = old_cluster.controldata.chkpnt_oldstMulti,
+				next_multi = old_cluster.controldata.chkpnt_nxtmulti,
+				multi,
+				old_entry,
+				new_entry;
+	bool		found = false;
+	uint64		oldest_offset = 0;
+
+	prep_status("Converting pg_multixact/offsets to 64-bit");
+
+	/* Locate the page holding the oldest multi in the old layout ... */
+	oldseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE_OLD;
+	oldseg.segno = oldseg.pageno / SLRU_PAGES_PER_SEGMENT;
+	oldseg.pageno %= SLRU_PAGES_PER_SEGMENT;
+	oldseg.dir = psprintf("%s/pg_multixact/offsets", old_cluster.pgdata);
+	oldseg.long_segment_names = false;	/* old format XXXX */
+
+	/* ... and in the new layout, where fewer entries fit on a page. */
+	newseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE;
+	newseg.segno = newseg.pageno / SLRU_PAGES_PER_SEGMENT;
+	newseg.pageno %= SLRU_PAGES_PER_SEGMENT;
+	newseg.dir = psprintf("%s/pg_multixact/offsets", new_cluster.pgdata);
+	newseg.long_segment_names = true;
+
+	old_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE_OLD;
+	new_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE;
+
+	if (next_multi < oldest_multi)
+		next_multi += (uint64) 1 << 32; /* wraparound */
+
+	for (multi = oldest_multi; multi < next_multi; old_entry = 0)
+	{
+		int			oldlen;
+		bool		empty;
+		unsigned long long read_segno;
+		unsigned long long read_pageno;
+
+		/* Handle possible segment wraparound. */
+		if (oldseg.segno > MaxMultiXactId /
+			MULTIXACT_OFFSETS_PER_PAGE_OLD /
+			SLRU_PAGES_PER_SEGMENT)
+			oldseg.segno = 0;
+
+		/*
+		 * Remember where we are reading from.  read_old_segment_page()
+		 * advances the position and may close the segment (dropping its
+		 * file name), so the state cannot be used for error reporting
+		 * afterwards.
+		 */
+		read_segno = (unsigned long long) oldseg.segno;
+		read_pageno = (unsigned long long) oldseg.pageno;
+
+		/* Read old offset segment. */
+		oldlen = read_old_segment_page(&oldseg, oldbuf, &empty);
+		if (oldlen <= 0 || empty)
+			pg_fatal("could not read page %llu of segment %llu in directory \"%s\"",
+					 read_pageno, read_segno, oldseg.dir);
+
+		/* Fill possible gap. */
+		if (oldlen < BLCKSZ)
+			memset((char *) oldbuf + oldlen, 0, BLCKSZ - oldlen);
+
+		/* Save oldest multi offset */
+		if (!found)
+		{
+			oldest_offset = oldbuf[old_entry];
+			found = true;
+		}
+
+		/* ... skip wrapped-around invalid multi */
+		if (multi == ((uint64) 1 << 32))
+		{
+			Assert(oldseg.segno == 0);
+			Assert(oldseg.pageno == 1);
+			Assert(old_entry == 0);
+
+			/*
+			 * The skipped multis still occupy slots on the new page.  Zero
+			 * them so that the following entries stay at the position that
+			 * corresponds to their multi number.
+			 */
+			for (uint64 skipped = 0; skipped < FirstMultiXactId; skipped++)
+				newbuf[new_entry++] = 0;
+
+			multi += FirstMultiXactId;
+			old_entry = FirstMultiXactId;
+		}
+
+		/* Copy entries to the new page. */
+		for (; multi < next_multi && old_entry < MULTIXACT_OFFSETS_PER_PAGE_OLD;
+			 multi++, old_entry++)
+		{
+			MultiXactOffset offset = oldbuf[old_entry];
+
+			/* Handle possible offset wraparound. */
+			if (offset < oldest_offset)
+				offset += ((uint64) 1 << 32) - 1;
+
+			/* Subtract oldest_offset, so new offsets will start from 1. */
+			newbuf[new_entry++] = offset - oldest_offset + 1;
+			if (new_entry >= MULTIXACT_OFFSETS_PER_PAGE)
+			{
+				/*
+				 * Multis are still 32 bits wide, so the new segments wrap
+				 * around just like the old ones do.
+				 */
+				if (newseg.segno > MaxMultiXactId /
+					MULTIXACT_OFFSETS_PER_PAGE /
+					SLRU_PAGES_PER_SEGMENT)
+					newseg.segno = 0;
+
+				/* Write a new page. */
+				write_new_segment_page(&newseg, newbuf);
+				new_entry = 0;
+			}
+		}
+	}
+
+	/* Write the last incomplete page. */
+	if (new_entry > 0 || oldest_multi == next_multi)
+	{
+		/* Same segment wraparound handling as for full pages. */
+		if (newseg.segno > MaxMultiXactId /
+			MULTIXACT_OFFSETS_PER_PAGE /
+			SLRU_PAGES_PER_SEGMENT)
+			newseg.segno = 0;
+
+		/* Clear the unused tail, it may hold entries of the previous page. */
+		memset(&newbuf[new_entry], 0,
+			   sizeof(newbuf[0]) * (MULTIXACT_OFFSETS_PER_PAGE - new_entry));
+		write_new_segment_page(&newseg, newbuf);
+	}
+
+	/* Release resources. */
+	close_segment(&oldseg);
+	close_segment(&newseg);
+
+	pfree(oldseg.dir);
+	pfree(newseg.dir);
+
+	check_ok();
+
+	return oldest_offset;
+}
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 9a0ae27823..f29dc9fc92 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202408122
+#define CATALOG_VERSION_NO 202408123
#endif
--
2.45.2
view thread (21+ messages) latest in thread
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected], [email protected], [email protected], [email protected]
Subject: Re: POC: make mxidoff 64 bits
In-Reply-To: <CACG=ezY9xq73jcX_EjVqx5-f90nbQ9PyhFCTW2fwFCS2wmNiFw@mail.gmail.com>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox