From b4db0517df2510bd452ae0e82d4b707e70fa29f5 Mon Sep 17 00:00:00 2001 From: Maxim Orlov Date: Wed, 6 Mar 2024 11:11:33 +0300 Subject: [PATCH v12 2/7] Use 64-bit multixact offsets. Author: Maxim Orlov --- src/backend/access/transam/multixact.c | 541 ++---------------------- src/backend/access/transam/xlog.c | 2 +- src/backend/commands/vacuum.c | 2 +- src/backend/postmaster/autovacuum.c | 4 +- src/bin/pg_resetwal/pg_resetwal.c | 2 +- src/bin/pg_resetwal/t/001_basic.pl | 2 +- src/include/access/multixact.h | 3 +- src/include/access/multixact_internal.h | 115 +++++ src/include/c.h | 2 +- 9 files changed, 156 insertions(+), 517 deletions(-) create mode 100644 src/include/access/multixact_internal.h diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 623fc8bdac..cd9db52e95 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -69,6 +69,7 @@ #include "postgres.h" #include "access/multixact.h" +#include "access/multixact_internal.h" #include "access/slru.h" #include "access/transam.h" #include "access/twophase.h" @@ -92,130 +93,14 @@ #include "utils/injection_point.h" #include "utils/memutils.h" - -/* - * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is - * used everywhere else in Postgres. - * - * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, - * MultiXact page numbering also wraps around at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in this module, except when comparing - * segment and page numbers in TruncateMultiXact (see - * MultiXactOffsetPagePrecedes). - */ - -/* We need four bytes per offset */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) - -static inline int64 -MultiXactIdToOffsetPage(MultiXactId multi) -{ - return multi / MULTIXACT_OFFSETS_PER_PAGE; -} - -static inline int -MultiXactIdToOffsetEntry(MultiXactId multi) -{ - return multi % MULTIXACT_OFFSETS_PER_PAGE; -} - -static inline int64 -MultiXactIdToOffsetSegment(MultiXactId multi) -{ - return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT; -} - -/* - * The situation for members is a bit more complex: we store one byte of - * additional flag bits for each TransactionId. To do this without getting - * into alignment issues, we store four bytes of flags, and then the - * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and - * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups - * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and - * performance) trumps space efficiency here. - * - * Note that the "offset" macros work with byte offset, not array indexes, so - * arithmetic must be done using "char *" pointers. - */ -/* We need eight bits per xact, so one xact fits in a byte */ -#define MXACT_MEMBER_BITS_PER_XACT 8 -#define MXACT_MEMBER_FLAGS_PER_BYTE 1 -#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) - -/* how many full bytes of flags are there in a group? */ -#define MULTIXACT_FLAGBYTES_PER_GROUP 4 -#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ - (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) -/* size in bytes of a complete group */ -#define MULTIXACT_MEMBERGROUP_SIZE \ - (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) -#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) -#define MULTIXACT_MEMBERS_PER_PAGE \ - (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) - /* - * Because the number of items per page is not a divisor of the last item - * number (member 0xFFFFFFFF), the last segment does not use the maximum number - * of pages, and moreover the last used page therein does not use the same - * number of items as previous pages. (Another way to say it is that the - * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page - * has some empty space after that item.) - * - * This constant is the number of members in the last page of the last segment. + * Multixact members warning threshold. + * + * If difference bettween nextOffset and oldestOffset exceed this value, we + * trigger autovacuumin order to release the disk space, reduce table bloat if + * possible. */ -#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ - ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) - -/* page in which a member is to be found */ -static inline int64 -MXOffsetToMemberPage(MultiXactOffset offset) -{ - return offset / MULTIXACT_MEMBERS_PER_PAGE; -} - -static inline int64 -MXOffsetToMemberSegment(MultiXactOffset offset) -{ - return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; -} - -/* Location (byte offset within page) of flag word for a given member */ -static inline int -MXOffsetToFlagsOffset(MultiXactOffset offset) -{ - MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; - int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; - - return byteoff; -} - -static inline int -MXOffsetToFlagsBitShift(MultiXactOffset offset) -{ - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; - - return bshift; -} - -/* Location (byte offset within page) of TransactionId of given member */ -static inline int -MXOffsetToMemberOffset(MultiXactOffset offset) -{ - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - - return MXOffsetToFlagsOffset(offset) + - MULTIXACT_FLAGBYTES_PER_GROUP + - member_in_group * sizeof(TransactionId); -} - -/* Multixact members wraparound thresholds. */ -#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) -#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ - (MaxMultiXactOffset - MaxMultiXactOffset / 4) +#define MULTIXACT_MEMBER_AUTOVAC_THRESHOLD UINT64CONST(0xFFFFFFFF) static inline MultiXactId PreviousMultiXactId(MultiXactId multi) @@ -260,11 +145,9 @@ typedef struct MultiXactStateData /* * Oldest multixact offset that is potentially referenced by a multixact - * referenced by a relation. We don't always know this value, so there's - * a flag here to indicate whether or not we currently do. + * referenced by a relation. */ MultiXactOffset oldestOffset; - bool oldestOffsetKnown; /* support for anti-wraparound measures */ MultiXactId multiVacLimit; @@ -272,9 +155,6 @@ typedef struct MultiXactStateData MultiXactId multiStopLimit; MultiXactId multiWrapLimit; - /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* * This is used to sleep until a multixact offset is written when we want * to create the next one. @@ -409,10 +289,8 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, - MultiXactOffset start, uint32 distance); static bool SetOffsetVacuumLimit(bool is_startup); -static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); +static MultiXactOffset find_multixact_start(MultiXactId multi); static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, @@ -1054,9 +932,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * against catastrophic data loss due to multixact wraparound. The basic * rules are: * - * If we're past multiVacLimit or the safe threshold for member storage - * space, or we don't know what the safe threshold for member storage is, - * start trying to force autovacuum cycles. + * If we're past multiVacLimit, start trying to force autovacuum cycles. * If we're past multiWarnLimit, start issuing warnings. * If we're past multiStopLimit, refuse to create new MultiXactIds. * @@ -1151,90 +1027,10 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) ExtendMultiXactOffset(result); /* - * Reserve the members space, similarly to above. Also, be careful not to - * return zero as the starting offset for any multixact. See - * GetMultiXactIdMembers() for motivation. + * Reserve the members space, similarly to above. */ nextOffset = MultiXactState->nextOffset; - if (nextOffset == 0) - { - *offset = 1; - nmembers++; /* allocate member slot 0 too */ - } - else - *offset = nextOffset; - - /*---------- - * Protect against overrun of the members space as well, with the - * following rules: - * - * If we're past offsetStopLimit, refuse to generate more multis. - * If we're close to offsetStopLimit, emit a warning. - * - * Arbitrarily, we start emitting warnings when we're 20 segments or less - * from offsetStopLimit. - * - * Note we haven't updated the shared state yet, so if we fail at this - * point, the multixact ID we grabbed can still be used by the next guy. - * - * Note that there is no point in forcing autovacuum runs here: the - * multixact freeze settings would have to be reduced for that to have any - * effect. - *---------- - */ -#define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, - nmembers)) - { - /* see comment in the corresponding offsets wraparound case */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("multixact \"members\" limit exceeded"), - errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", - "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", - MultiXactState->offsetStopLimit - nextOffset - 1, - nmembers, - MultiXactState->offsetStopLimit - nextOffset - 1), - errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.", - MultiXactState->oldestMultiXactDB))); - } - - /* - * Check whether we should kick autovacuum into action, to prevent members - * wraparound. NB we use a much larger window to trigger autovacuum than - * just the warning limit. The warning is just a measure of last resort - - * this is in line with GetNewTransactionId's behaviour. - */ - if (!MultiXactState->oldestOffsetKnown || - (MultiXactState->nextOffset - MultiXactState->oldestOffset - > MULTIXACT_MEMBER_SAFE_THRESHOLD)) - { - /* - * To avoid swamping the postmaster with signals, we issue the autovac - * request only when crossing a segment boundary. With default - * compilation settings that's roughly after 50k members. This still - * gives plenty of chances before we get into real trouble. - */ - if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != - (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - } - - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, - nextOffset, - nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) - ereport(WARNING, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", - "database with OID %u must be vacuumed before %d more multixact members are used", - MultiXactState->offsetStopLimit - nextOffset + nmembers, - MultiXactState->oldestMultiXactDB, - MultiXactState->offsetStopLimit - nextOffset + nmembers), - errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings."))); + *offset = nextOffset; ExtendMultiXactMember(nextOffset, nmembers); @@ -2620,22 +2416,9 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) } /* - * Compute the number of items till end of current page. Careful: if - * addition of unsigned ints wraps around, we're at the last page of - * the last segment; since that page holds a different number of items - * than other pages, we need to do it differently. + * Compute the number of items till end of current page. */ - if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) - { - /* - * This is the last page of the last segment; we can compute the - * number of items left to allocate in it without modulo - * arithmetic. - */ - difference = MaxMultiXactOffset - offset + 1; - } - else - difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; /* * Advance to next page, taking care to properly handle the wraparound @@ -2701,15 +2484,13 @@ GetOldestMultiXactId(void) } /* - * Determine how aggressively we need to vacuum in order to prevent member - * wraparound. + * Determine if we need to vacuum for member or not. * * To do so determine what's the oldest member offset and install the limit * info in MultiXactState, where it can be used to prevent overrun of old data * in the members SLRU area. * - * The return value is true if emergency autovacuum is required and false - * otherwise. + * The return value is true if autovacuum is required and false otherwise. */ static bool SetOffsetVacuumLimit(bool is_startup) @@ -2717,12 +2498,7 @@ SetOffsetVacuumLimit(bool is_startup) MultiXactId oldestMultiXactId; MultiXactId nextMXact; MultiXactOffset oldestOffset = 0; /* placate compiler */ - MultiXactOffset prevOldestOffset; MultiXactOffset nextOffset; - bool oldestOffsetKnown = false; - bool prevOldestOffsetKnown; - MultiXactOffset offsetStopLimit = 0; - MultiXactOffset prevOffsetStopLimit; /* * NB: Have to prevent concurrent truncation, we might otherwise try to @@ -2735,9 +2511,6 @@ SetOffsetVacuumLimit(bool is_startup) oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; - prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; - prevOldestOffset = MultiXactState->oldestOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; Assert(MultiXactState->finishedStartup); LWLockRelease(MultiXactGenLock); @@ -2755,139 +2528,31 @@ SetOffsetVacuumLimit(bool is_startup) * offset. */ oldestOffset = nextOffset; - oldestOffsetKnown = true; } else - { - /* - * Figure out where the oldest existing multixact's offsets are - * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, - * the supposedly-earliest multixact might not really exist. We are - * careful not to fail in that case. - */ - oldestOffsetKnown = - find_multixact_start(oldestMultiXactId, &oldestOffset); - - if (oldestOffsetKnown) - ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId member is at offset %u", - oldestOffset))); - else - ereport(LOG, - (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", - oldestMultiXactId))); - } + oldestOffset = find_multixact_start(oldestMultiXactId); LWLockRelease(MultiXactTruncationLock); - /* - * If we can, compute limits (and install them MultiXactState) to prevent - * overrun of old data in the members SLRU area. We can only do so if the - * oldest offset is known though. - */ - if (oldestOffsetKnown) - { - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - if (!prevOldestOffsetKnown && !is_startup) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - - ereport(DEBUG1, - (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", - offsetStopLimit, oldestMultiXactId))); - } - else if (prevOldestOffsetKnown) - { - /* - * If we failed to get the oldest offset this time, but we have a - * value from a previous pass through this function, use the old - * values rather than automatically forcing an emergency autovacuum - * cycle again. - */ - oldestOffset = prevOldestOffset; - oldestOffsetKnown = true; - offsetStopLimit = prevOffsetStopLimit; - } - /* Install the computed values */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->oldestOffset = oldestOffset; - MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->offsetStopLimit = offsetStopLimit; LWLockRelease(MultiXactGenLock); /* - * Do we need an emergency autovacuum? If we're not sure, assume yes. - */ - return !oldestOffsetKnown || - (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); -} - -/* - * Return whether adding "distance" to "start" would move past "boundary". - * - * We use this to determine whether the addition is "wrapping around" the - * boundary point, hence the name. The reason we don't want to use the regular - * 2^31-modulo arithmetic here is that we want to be able to use the whole of - * the 2^32-1 space here, allowing for more multixacts than would fit - * otherwise. - */ -static bool -MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, - uint32 distance) -{ - MultiXactOffset finish; - - /* - * Note that offset number 0 is not used (see GetMultiXactIdMembers), so - * if the addition wraps around the UINT_MAX boundary, skip that value. - */ - finish = start + distance; - if (finish < start) - finish++; - - /*----------------------------------------------------------------------- - * When the boundary is numerically greater than the starting point, any - * value numerically between the two is not wrapped: - * - * <----S----B----> - * [---) = F wrapped past B (and UINT_MAX) - * [---) = F not wrapped - * [----] = F wrapped past B - * - * When the boundary is numerically less than the starting point (i.e. the - * UINT_MAX wraparound occurs somewhere in between) then all values in - * between are wrapped: - * - * <----B----S----> - * [---) = F not wrapped past B (but wrapped past UINT_MAX) - * [---) = F wrapped past B (and UINT_MAX) - * [----] = F not wrapped - *----------------------------------------------------------------------- + * Do we need autovacuum? */ - if (start < boundary) - return finish >= boundary || finish < start; - else - return finish >= boundary && finish < start; + return (nextOffset - oldestOffset > MULTIXACT_MEMBER_AUTOVAC_THRESHOLD); } /* * Find the starting offset of the given MultiXactId. * - * Returns false if the file containing the multi does not exist on disk. - * Otherwise, returns true and sets *result to the starting member offset. - * * This function does not prevent concurrent truncation, so if that's * required, the caller has to protect against that. */ -static bool -find_multixact_start(MultiXactId multi, MultiXactOffset *result) +static MultiXactOffset +find_multixact_start(MultiXactId multi) { MultiXactOffset offset; int64 pageno; @@ -2900,15 +2565,6 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); - /* - * Write out dirty data, so PhysicalPageExists can work correctly. - */ - SimpleLruWriteAll(MultiXactOffsetCtl, true); - SimpleLruWriteAll(MultiXactMemberCtl, true); - - if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) - return false; - /* lock is acquired by SimpleLruReadPage_ReadOnly */ slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; @@ -2916,102 +2572,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) offset = *offptr; LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno)); - *result = offset; - return true; -} - -/* - * Determine how many multixacts, and how many multixact members, currently - * exist. Return false if unable to determine. - */ -static bool -ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) -{ - MultiXactOffset nextOffset; - MultiXactOffset oldestOffset; - MultiXactId oldestMultiXactId; - MultiXactId nextMultiXactId; - bool oldestOffsetKnown; - - LWLockAcquire(MultiXactGenLock, LW_SHARED); - nextOffset = MultiXactState->nextOffset; - oldestMultiXactId = MultiXactState->oldestMultiXactId; - nextMultiXactId = MultiXactState->nextMXact; - oldestOffset = MultiXactState->oldestOffset; - oldestOffsetKnown = MultiXactState->oldestOffsetKnown; - LWLockRelease(MultiXactGenLock); - - if (!oldestOffsetKnown) - return false; - - *members = nextOffset - oldestOffset; - *multixacts = nextMultiXactId - oldestMultiXactId; - return true; -} - -/* - * Multixact members can be removed once the multixacts that refer to them - * are older than every datminmxid. autovacuum_multixact_freeze_max_age and - * vacuum_multixact_freeze_table_age work together to make sure we never have - * too many multixacts; we hope that, at least under normal circumstances, - * this will also be sufficient to keep us from using too many offsets. - * However, if the average multixact has many members, we might exhaust the - * members space while still using few enough members that these limits fail - * to trigger relminmxid advancement by VACUUM. At that point, we'd have no - * choice but to start failing multixact-creating operations with an error. - * - * To prevent that, if more than a threshold portion of the members space is - * used, we effectively reduce autovacuum_multixact_freeze_max_age and - * to a value just less than the number of multixacts in use. We hope that - * this will quickly trigger autovacuuming on the table or tables with the - * oldest relminmxid, thus allowing datminmxid values to advance and removing - * some members. - * - * As the fraction of the member space currently in use grows, we become - * more aggressive in clamping this value. That not only causes autovacuum - * to ramp up, but also makes any manual vacuums the user issues more - * aggressive. This happens because vacuum_get_cutoffs() will clamp the - * freeze table and the minimum freeze age cutoffs based on the effective - * autovacuum_multixact_freeze_max_age this function returns. In the worst - * case, we'll claim the freeze_max_age to zero, and every vacuum of any - * table will freeze every multixact. - */ -int -MultiXactMemberFreezeThreshold(void) -{ - MultiXactOffset members; - uint32 multixacts; - uint32 victim_multixacts; - double fraction; - int result; - - /* If we can't determine member space utilization, assume the worst. */ - if (!ReadMultiXactCounts(&multixacts, &members)) - return 0; - - /* If member space utilization is low, no special action is required. */ - if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) - return autovacuum_multixact_freeze_max_age; - - /* - * Compute a target for relminmxid advancement. The number of multixacts - * we try to eliminate from the system is based on how far we are past - * MULTIXACT_MEMBER_SAFE_THRESHOLD. - */ - fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / - (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); - victim_multixacts = multixacts * fraction; - - /* fraction could be > 1.0, but lowest possible freeze age is zero */ - if (victim_multixacts > multixacts) - return 0; - result = multixacts - victim_multixacts; - - /* - * Clamp to autovacuum_multixact_freeze_max_age, so that we never make - * autovacuum less aggressive than it would otherwise be. - */ - return Min(result, autovacuum_multixact_freeze_max_age); + return offset; } typedef struct mxtruncinfo @@ -3039,37 +2600,13 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data /* - * Delete members segments [oldest, newOldest) - * - * The members SLRU can, in contrast to the offsets one, be filled to almost - * the full range at once. This means SimpleLruTruncate() can't trivially be - * used - instead the to-be-deleted range is computed using the offsets - * SLRU. C.f. TruncateMultiXact(). + * Delete members segments before the newOldestOffset. */ static void -PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) +PerformMembersTruncation(MultiXactOffset newOldestOffset) { - const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); - int64 startsegment = MXOffsetToMemberSegment(oldestOffset); - int64 endsegment = MXOffsetToMemberSegment(newOldestOffset); - int64 segment = startsegment; - - /* - * Delete all the segments but the last one. The last segment can still - * contain, possibly partially, valid data. - */ - while (segment != endsegment) - { - elog(DEBUG2, "truncating multixact members segment %llx", - (unsigned long long) segment); - SlruDeleteSegment(MultiXactMemberCtl, segment); - - /* move to next segment, handling wraparound correctly */ - if (segment == maxsegment) - segment = 0; - else - segment += 1; - } + SimpleLruTruncate(MultiXactMemberCtl, + MXOffsetToMemberPage(newOldestOffset)); } /* @@ -3174,23 +2711,15 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) /* * First, compute the safe truncation point for MultiXactMember. This is * the starting offset of the oldest multixact. - * - * Hopefully, find_multixact_start will always work here, because we've - * already checked that it doesn't precede the earliest MultiXact on disk. - * But if it fails, don't truncate anything, and log a message. */ if (oldestMulti == nextMulti) { /* there are NO MultiXacts */ oldestOffset = nextOffset; } - else if (!find_multixact_start(oldestMulti, &oldestOffset)) + else { - ereport(LOG, - (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation", - oldestMulti, earliest))); - LWLockRelease(MultiXactTruncationLock); - return; + oldestOffset = find_multixact_start(oldestMulti); } /* @@ -3202,13 +2731,9 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) /* there are NO MultiXacts */ newOldestOffset = nextOffset; } - else if (!find_multixact_start(newOldestMulti, &newOldestOffset)) + else { - ereport(LOG, - (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation", - newOldestMulti))); - LWLockRelease(MultiXactTruncationLock); - return; + newOldestOffset = find_multixact_start(newOldestMulti); } elog(DEBUG1, "performing multixact truncation: " @@ -3258,7 +2783,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) LWLockRelease(MultiXactGenLock); /* First truncate members */ - PerformMembersTruncation(oldestOffset, newOldestOffset); + PerformMembersTruncation(newOldestOffset); /* Then offsets */ PerformOffsetsTruncation(oldestMulti, newOldestMulti); @@ -3345,7 +2870,7 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) { - int32 diff = (int32) (offset1 - offset2); + int64 diff = (int64) (offset1 - offset2); return (diff < 0); } @@ -3492,7 +3017,7 @@ multixact_redo(XLogReaderState *record) */ SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); - PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); + PerformMembersTruncation(xlrec.endTruncMemb); /* * During XLOG replay, latest_page_number isn't necessarily set up diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index bf3dbda901..a813a090fa 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5083,7 +5083,7 @@ BootStrapXLOG(uint32 data_checksum_version) FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; checkPoint.nextMulti = FirstMultiXactId; - checkPoint.nextMultiOffset = 0; + checkPoint.nextMultiOffset = 1; checkPoint.oldestXid = FirstNormalTransactionId; checkPoint.oldestXidDB = Template1DbOid; checkPoint.oldestMulti = FirstMultiXactId; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index e6745e6145..c96fbf004d 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1134,7 +1134,7 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, * normally autovacuum_multixact_freeze_max_age, but may be less if we are * short of multixact member space. */ - effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age; /* * Almost ready to set freeze output parameters; check if OldestXmin or diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 0ab921a169..ed5fc09c38 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -1134,7 +1134,7 @@ do_start_worker(void) /* Also determine the oldest datminmxid we will consider. */ recentMulti = ReadNextMultiXactId(); - multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold(); + multiForceLimit = recentMulti - autovacuum_multixact_freeze_max_age; if (multiForceLimit < FirstMultiXactId) multiForceLimit -= FirstMultiXactId; @@ -1922,7 +1922,7 @@ do_autovacuum(void) * normally autovacuum_multixact_freeze_max_age, but may be less if we are * short of multixact member space. */ - effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age; /* * Find the pg_database entry and select the default freeze ages. We use diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index fff401e469..4ad64cf1ed 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -264,7 +264,7 @@ main(int argc, char *argv[]) case 'O': errno = 0; - set_mxoff = strtoul(optarg, &endptr, 0); + set_mxoff = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-O"); diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl index d0bd1f7ace..79d03d79de 100644 --- a/src/bin/pg_resetwal/t/001_basic.pl +++ b/src/bin/pg_resetwal/t/001_basic.pl @@ -206,7 +206,7 @@ push @cmd, sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1])); @files = get_slru_files('pg_multixact/offsets'); -$mult = 32 * $blcksz / 4; +$mult = 32 * $blcksz / 8; # -m argument is "new,old" push @cmd, '-m', sprintf("%d,%d", diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 4e6b0eec2f..5ee632dfe6 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -27,7 +27,7 @@ #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) -#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) +#define MaxMultiXactOffset UINT64CONST(0xFFFFFFFFFFFFFFFF) /* * Possible multixact lock modes ("status"). The first four modes are for @@ -143,7 +143,6 @@ extern void MultiXactSetNextMXact(MultiXactId nextMulti, extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset); extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); -extern int MultiXactMemberFreezeThreshold(void); extern void multixact_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len); diff --git a/src/include/access/multixact_internal.h b/src/include/access/multixact_internal.h new file mode 100644 index 0000000000..39e74a21c7 --- /dev/null +++ b/src/include/access/multixact_internal.h @@ -0,0 +1,115 @@ +/* + * multixact_internal.h + * + * Internal definitions for the on-disk format of multixact manager. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/multixact_internal.h + */ +#ifndef MULTIXACT_INTERNAL_H +#define MULTIXACT_INTERNAL_H + +/* FIXME: had to duplicate this */ +#define SLRU_PAGES_PER_SEGMENT 32 + +/* + * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is + * used everywhere else in Postgres. + */ + +/* We need four bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ + return multi % MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int64 +MultiXactIdToOffsetSegment(MultiXactId multi) +{ + return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT; +} + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(MultiXactOffset offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +static inline int64 +MXOffsetToMemberSegment(MultiXactOffset offset) +{ + return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(MultiXactOffset offset) +{ + MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +static inline int +MXOffsetToFlagsBitShift(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +#endif /* MULTIXACT_INTERNAL_H */ diff --git a/src/include/c.h b/src/include/c.h index a14c631516..318194f78d 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -618,7 +618,7 @@ typedef uint32 SubTransactionId; /* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */ typedef TransactionId MultiXactId; -typedef uint32 MultiXactOffset; +typedef uint64 MultiXactOffset; typedef uint32 CommandId; -- 2.43.0