diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 3ca0582db364..052dd0a4ce56 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -65,7 +65,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; - appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid, + appendStringInfo(buf, "%u offset %" PRIu64 " nmembers %d: ", xlrec->mid, xlrec->moff, xlrec->nmembers); for (i = 0; i < xlrec->nmembers; i++) out_member(buf, &xlrec->members[i]); @@ -74,7 +74,7 @@ multixact_desc(StringInfo buf, XLogReaderState *record) { xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; - appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)", + appendStringInfo(buf, "offsets [%u, %u), members [%" PRIu64 ", %" PRIu64 ")", xlrec->startTruncOff, xlrec->endTruncOff, xlrec->startTruncMemb, xlrec->endTruncMemb); } diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index cd6c2a2f650a..441034f5929c 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -66,7 +66,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%08X; " - "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " + "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %" PRIu64 "; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 9d5f130af7ef..26f8a10c377c 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -69,6 +69,7 @@ #include "postgres.h" #include "access/multixact.h" +#include 
"access/multixact_internal.h" #include "access/slru.h" #include "access/twophase.h" #include "access/twophase_rmgr.h" @@ -88,35 +89,6 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" - -/* - * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is - * used everywhere else in Postgres. - * - * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, - * MultiXact page numbering also wraps around at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in this module, except when comparing - * segment and page numbers in TruncateMultiXact (see - * MultiXactOffsetPagePrecedes). - */ - -/* We need four bytes per offset */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) - -static inline int64 -MultiXactIdToOffsetPage(MultiXactId multi) -{ - return multi / MULTIXACT_OFFSETS_PER_PAGE; -} - -static inline int -MultiXactIdToOffsetEntry(MultiXactId multi) -{ - return multi % MULTIXACT_OFFSETS_PER_PAGE; -} - static inline int64 MultiXactIdToOffsetSegment(MultiXactId multi) { @@ -124,94 +96,13 @@ MultiXactIdToOffsetSegment(MultiXactId multi) } /* - * The situation for members is a bit more complex: we store one byte of - * additional flag bits for each TransactionId. To do this without getting - * into alignment issues, we store four bytes of flags, and then the - * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and - * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups - * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and - * performance) trumps space efficiency here. + * Multixact members warning threshold. * - * Note that the "offset" macros work with byte offset, not array indexes, so - * arithmetic must be done using "char *" pointers. 
+ * If the difference between nextOffset and oldestOffset exceeds this value, + * we trigger autovacuum in order to release disk space consumed by the + * members SLRU. */ -/* We need eight bits per xact, so one xact fits in a byte */ -#define MXACT_MEMBER_BITS_PER_XACT 8 -#define MXACT_MEMBER_FLAGS_PER_BYTE 1 -#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) - -/* how many full bytes of flags are there in a group? */ -#define MULTIXACT_FLAGBYTES_PER_GROUP 4 -#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ - (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) -/* size in bytes of a complete group */ -#define MULTIXACT_MEMBERGROUP_SIZE \ - (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) -#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) -#define MULTIXACT_MEMBERS_PER_PAGE \ - (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) - -/* - * Because the number of items per page is not a divisor of the last item - * number (member 0xFFFFFFFF), the last segment does not use the maximum number - * of pages, and moreover the last used page therein does not use the same - * number of items as previous pages. (Another way to say it is that the - * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page - * has some empty space after that item.) - * - * This constant is the number of members in the last page of the last segment. 
- */ -#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ - ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) - -/* page in which a member is to be found */ -static inline int64 -MXOffsetToMemberPage(MultiXactOffset offset) -{ - return offset / MULTIXACT_MEMBERS_PER_PAGE; -} - -static inline int64 -MXOffsetToMemberSegment(MultiXactOffset offset) -{ - return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; -} - -/* Location (byte offset within page) of flag word for a given member */ -static inline int -MXOffsetToFlagsOffset(MultiXactOffset offset) -{ - MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; - int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; - - return byteoff; -} - -static inline int -MXOffsetToFlagsBitShift(MultiXactOffset offset) -{ - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; - - return bshift; -} - -/* Location (byte offset within page) of TransactionId of given member */ -static inline int -MXOffsetToMemberOffset(MultiXactOffset offset) -{ - int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; - - return MXOffsetToFlagsOffset(offset) + - MULTIXACT_FLAGBYTES_PER_GROUP + - member_in_group * sizeof(TransactionId); -} - -/* Multixact members wraparound thresholds. */ -#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) -#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ - (MaxMultiXactOffset - MaxMultiXactOffset / 4) +#define MULTIXACT_MEMBER_AUTOVAC_THRESHOLD UINT64CONST(4000000000) static inline MultiXactId PreviousMultiXactId(MultiXactId multi) @@ -254,23 +145,12 @@ typedef struct MultiXactStateData MultiXactId oldestMultiXactId; Oid oldestMultiXactDB; - /* - * Oldest multixact offset that is potentially referenced by a multixact - * referenced by a relation. We don't always know this value, so there's - * a flag here to indicate whether or not we currently do. 
- */ - MultiXactOffset oldestOffset; - bool oldestOffsetKnown; - /* support for anti-wraparound measures */ MultiXactId multiVacLimit; MultiXactId multiWarnLimit; MultiXactId multiStopLimit; MultiXactId multiWrapLimit; - /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* * This is used to sleep until a multixact offset is written when we want * to create the next one. @@ -401,8 +281,6 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, - MultiXactOffset start, uint32 distance); static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); static void WriteMTruncateXlogRec(Oid oldestMultiDB, @@ -1142,90 +1020,22 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) ExtendMultiXactOffset(result); /* - * Reserve the members space, similarly to above. Also, be careful not to - * return zero as the starting offset for any multixact. See - * GetMultiXactIdMembers() for motivation. + * Reserve the members space, similarly to above. */ nextOffset = MultiXactState->nextOffset; - if (nextOffset == 0) - { - *offset = 1; - nmembers++; /* allocate member slot 0 too */ - } - else - *offset = nextOffset; - - /*---------- - * Protect against overrun of the members space as well, with the - * following rules: - * - * If we're past offsetStopLimit, refuse to generate more multis. - * If we're close to offsetStopLimit, emit a warning. - * - * Arbitrarily, we start emitting warnings when we're 20 segments or less - * from offsetStopLimit. - * - * Note we haven't updated the shared state yet, so if we fail at this - * point, the multixact ID we grabbed can still be used by the next guy. 
- * - * Note that there is no point in forcing autovacuum runs here: the - * multixact freeze settings would have to be reduced for that to have any - * effect. - *---------- - */ -#define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, - nmembers)) - { - /* see comment in the corresponding offsets wraparound case */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("multixact \"members\" limit exceeded"), - errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", - "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", - MultiXactState->offsetStopLimit - nextOffset - 1, - nmembers, - MultiXactState->offsetStopLimit - nextOffset - 1), - errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.", - MultiXactState->oldestMultiXactDB))); - } /* - * Check whether we should kick autovacuum into action, to prevent members - * wraparound. NB we use a much larger window to trigger autovacuum than - * just the warning limit. The warning is just a measure of last resort - - * this is in line with GetNewTransactionId's behaviour. + * Offsets are 64-bit integers and will never wrap around. Firstly, it + * would take an unrealistic amount of time and resources to consume 2^64 + * offsets. Secondly, multixid creation is WAL-logged, so you would run + * out of LSNs before reaching offset wraparound. Nevertheless, check for + * wraparound as a sanity check. 
*/ - if (!MultiXactState->oldestOffsetKnown || - (MultiXactState->nextOffset - MultiXactState->oldestOffset - > MULTIXACT_MEMBER_SAFE_THRESHOLD)) - { - /* - * To avoid swamping the postmaster with signals, we issue the autovac - * request only when crossing a segment boundary. With default - * compilation settings that's roughly after 50k members. This still - * gives plenty of chances before we get into real trouble. - */ - if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != - (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - } - - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, - nextOffset, - nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) - ereport(WARNING, + if (nextOffset + nmembers < nextOffset) + ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", - "database with OID %u must be vacuumed before %d more multixact members are used", - MultiXactState->offsetStopLimit - nextOffset + nmembers, - MultiXactState->oldestMultiXactDB, - MultiXactState->offsetStopLimit - nextOffset + nmembers), - errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings."))); + errmsg("MultiXact members would wrap around"))); + *offset = nextOffset; ExtendMultiXactMember(nextOffset, nmembers); @@ -1246,8 +1056,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * the next iteration. But note that nextMXact may be InvalidMultiXactId * or the first value on a segment-beginning page after this routine * exits, so anyone else looking at the variable must be prepared to deal - * with either case. Similarly, nextOffset may be zero, but we won't use - * that as the actual start offset of the next multixact. 
+ * with either case. */ (MultiXactState->nextMXact)++; @@ -1255,7 +1064,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); + debug_elog4(DEBUG2, "GetNew: returning %u offset %" PRIu64, result, + *offset); return result; } @@ -1297,7 +1107,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactOffset *offptr; MultiXactOffset offset; int length; - int truelength; MultiXactId oldestMXact; MultiXactId nextMXact; MultiXactId tmpMXact; @@ -1396,16 +1205,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * we have just for this; the process in charge will signal the CV as soon * as it has finished writing the multixact offset. * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset - * wraparound. If we see next multixact's offset is one, is that our - * multixact's actual endpoint, or did it end at zero with a subsequent - * increment? We handle this using the knowledge that if the zero'th - * member slot wasn't filled, it'll contain zero, and zero isn't a valid - * transaction ID so it can't be a multixact member. Therefore, if we - * read a zero from the members array, just ignore it. - * - * This is all pretty messy, but the mess occurs only in infrequent corner + * This is a little messy, but the mess occurs only in infrequent corner * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. */ @@ -1491,6 +1291,9 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, LWLockRelease(lock); lock = NULL; + /* A multixid with zero members should not happen */ + Assert(length > 0); + /* * If we slept above, clean up state; it's no longer needed. 
*/ @@ -1499,7 +1302,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); - truelength = 0; prev_pageno = -1; for (int i = 0; i < length; i++, offset++) { @@ -1536,37 +1338,27 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, xactptr = (TransactionId *) (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - - if (!TransactionIdIsValid(*xactptr)) - { - /* Corner case 3: we must be looking at unused slot zero */ - Assert(offset == 0); - continue; - } + Assert(TransactionIdIsValid(*xactptr)); flagsoff = MXOffsetToFlagsOffset(offset); bshift = MXOffsetToFlagsBitShift(offset); flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); - ptr[truelength].xid = *xactptr; - ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; - truelength++; + ptr[i].xid = *xactptr; + ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; } LWLockRelease(lock); - /* A multixid with zero members should not happen */ - Assert(truelength > 0); - /* * Copy the result into the local cache. 
*/ - mXactCachePut(multi, truelength, ptr); + mXactCachePut(multi, length, ptr); debug_elog3(DEBUG2, "GetMembers: no cache for %s", - mxid_to_string(multi, truelength, ptr)); + mxid_to_string(multi, length, ptr)); *members = ptr; - return truelength; + return length; } /* @@ -1973,7 +1765,7 @@ MultiXactShmemInit(void) "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER, LWTRANCHE_MULTIXACTMEMBER_SLRU, SYNC_HANDLER_MULTIXACT_MEMBER, - false); + true); /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ /* Initialize our shared state struct */ @@ -2029,48 +1821,6 @@ BootStrapMultiXact(void) SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0); } -/* - * MaybeExtendOffsetSlru - * Extend the offsets SLRU area, if necessary - * - * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might - * contain files that are shorter than necessary; this would occur if the old - * installation had used multixacts beyond the first page (files cannot be - * copied, because the on-disk representation is different). pg_upgrade would - * update pg_control to set the next offset value to be at that position, so - * that tuples marked as locked by such MultiXacts would be seen as visible - * without having to consult multixact. However, trying to create and use a - * new MultiXactId would result in an error because the page on which the new - * value would reside does not exist. This routine is in charge of creating - * such pages. - */ -static void -MaybeExtendOffsetSlru(void) -{ - int64 pageno; - LWLock *lock; - - pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) - { - int slotno; - - /* - * Fortunately for us, SimpleLruWritePage is already prepared to deal - * with creating a new segment file even if the page we're writing is - * not the first in it, so this is enough. 
- */ - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - } - - LWLockRelease(lock); -} - /* * This must be called ONCE during postmaster or standalone-backend startup. * @@ -2150,7 +1900,6 @@ TrimMultiXact(void) slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; @@ -2223,7 +1972,7 @@ MultiXactGetCheckptMulti(bool is_shutdown, LWLockRelease(MultiXactGenLock); debug_elog6(DEBUG2, - "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + "MultiXact: checkpoint is nextMulti %u, nextOffset %" PRIu64 ", oldestMulti %u in DB %u", *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); } @@ -2258,26 +2007,12 @@ void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset) { - debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %" PRIu64, nextMulti, nextMultiOffset); LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->nextMXact = nextMulti; MultiXactState->nextOffset = nextMultiOffset; LWLockRelease(MultiXactGenLock); - - /* - * During a binary upgrade, make sure that the offsets SLRU is large - * enough to contain the next value that would be created. - * - * We need to do this pretty early during the first startup in binary - * upgrade mode: before StartupMultiXact() in fact, because this routine - * is called even before that by StartupXLOG(). And we can't do it - * earlier than at this point, because during that first call of this - * routine we determine the MultiXactState->nextMXact value that - * MaybeExtendOffsetSlru needs. 
- */ - if (IsBinaryUpgrade) - MaybeExtendOffsetSlru(); } /* @@ -2449,7 +2184,7 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, } if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) { - debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + debug_elog3(DEBUG2, "MultiXact: setting next offset to %" PRIu64, minMultiOffset); MultiXactState->nextOffset = minMultiOffset; } @@ -2551,23 +2286,8 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockRelease(lock); } - /* - * Compute the number of items till end of current page. Careful: if - * addition of unsigned ints wraps around, we're at the last page of - * the last segment; since that page holds a different number of items - * than other pages, we need to do it differently. - */ - if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) - { - /* - * This is the last page of the last segment; we can compute the - * number of items left to allocate in it without modulo - * arithmetic. - */ - difference = MaxMultiXactOffset - offset + 1; - } - else - difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + /* Compute the number of items till end of current page. */ + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; /* * Advance to next page, taking care to properly handle the wraparound @@ -2633,15 +2353,14 @@ GetOldestMultiXactId(void) } /* - * Determine how aggressively we need to vacuum in order to prevent member - * wraparound. + * Determine if we need to vacuum to keep the size of the members SLRU in + * check. * * To do so determine what's the oldest member offset and install the limit * info in MultiXactState, where it can be used to prevent overrun of old data * in the members SLRU area. * - * The return value is true if emergency autovacuum is required and false - * otherwise. + * The return value is true if autovacuum is required and false otherwise. 
*/ static bool SetOffsetVacuumLimit(bool is_startup) @@ -2649,12 +2368,7 @@ SetOffsetVacuumLimit(bool is_startup) MultiXactId oldestMultiXactId; MultiXactId nextMXact; MultiXactOffset oldestOffset = 0; /* placate compiler */ - MultiXactOffset prevOldestOffset; MultiXactOffset nextOffset; - bool oldestOffsetKnown = false; - bool prevOldestOffsetKnown; - MultiXactOffset offsetStopLimit = 0; - MultiXactOffset prevOffsetStopLimit; /* * NB: Have to prevent concurrent truncation, we might otherwise try to @@ -2667,9 +2381,6 @@ SetOffsetVacuumLimit(bool is_startup) oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; - prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; - prevOldestOffset = MultiXactState->oldestOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; Assert(MultiXactState->finishedStartup); LWLockRelease(MultiXactGenLock); @@ -2687,126 +2398,20 @@ SetOffsetVacuumLimit(bool is_startup) * offset. */ oldestOffset = nextOffset; - oldestOffsetKnown = true; } - else + else if (!find_multixact_start(oldestMultiXactId, &oldestOffset)) { - /* - * Figure out where the oldest existing multixact's offsets are - * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, - * the supposedly-earliest multixact might not really exist. We are - * careful not to fail in that case. 
- */ - oldestOffsetKnown = - find_multixact_start(oldestMultiXactId, &oldestOffset); - - if (oldestOffsetKnown) - ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId member is at offset %u", - oldestOffset))); - else - ereport(LOG, - (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", - oldestMultiXactId))); + ereport(LOG, + (errmsg("oldest checkpointed MultiXact %u does not exist on disk", + oldestMultiXactId))); } LWLockRelease(MultiXactTruncationLock); /* - * If we can, compute limits (and install them MultiXactState) to prevent - * overrun of old data in the members SLRU area. We can only do so if the - * oldest offset is known though. - */ - if (oldestOffsetKnown) - { - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - if (!prevOldestOffsetKnown && !is_startup) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - - ereport(DEBUG1, - (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", - offsetStopLimit, oldestMultiXactId))); - } - else if (prevOldestOffsetKnown) - { - /* - * If we failed to get the oldest offset this time, but we have a - * value from a previous pass through this function, use the old - * values rather than automatically forcing an emergency autovacuum - * cycle again. 
- */ - oldestOffset = prevOldestOffset; - oldestOffsetKnown = true; - offsetStopLimit = prevOffsetStopLimit; - } - - /* Install the computed values */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->oldestOffset = oldestOffset; - MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->offsetStopLimit = offsetStopLimit; - LWLockRelease(MultiXactGenLock); - - /* - * Do we need an emergency autovacuum? If we're not sure, assume yes. + * Do we need autovacuum? (Unknown oldest offset: presumably left at 0.) */ - return !oldestOffsetKnown || - (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); -} - -/* - * Return whether adding "distance" to "start" would move past "boundary". - * - * We use this to determine whether the addition is "wrapping around" the - * boundary point, hence the name. The reason we don't want to use the regular - * 2^31-modulo arithmetic here is that we want to be able to use the whole of - * the 2^32-1 space here, allowing for more multixacts than would fit - * otherwise. - */ -static bool -MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, - uint32 distance) -{ - MultiXactOffset finish; - - /* - * Note that offset number 0 is not used (see GetMultiXactIdMembers), so - * if the addition wraps around the UINT_MAX boundary, skip that value. - */ - finish = start + distance; - if (finish < start) - finish++; - - /*----------------------------------------------------------------------- - * When the boundary is numerically greater than the starting point, any - * value numerically between the two is not wrapped: - * - * <----S----B----> - * [---) = F wrapped past B (and UINT_MAX) - * [---) = F not wrapped - * [----] = F wrapped past B - * - * When the boundary is numerically less than the starting point (i.e. 
the - * UINT_MAX wraparound occurs somewhere in between) then all values in - * between are wrapped: - * - * <----B----S----> - * [---) = F not wrapped past B (but wrapped past UINT_MAX) - * [---) = F wrapped past B (and UINT_MAX) - * [----] = F not wrapped - *----------------------------------------------------------------------- - */ - if (start < boundary) - return finish >= boundary || finish < start; - else - return finish >= boundary && finish < start; + return nextOffset - oldestOffset > MULTIXACT_MEMBER_AUTOVAC_THRESHOLD; } /* @@ -2846,120 +2451,13 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; offset = *offptr; + LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno)); *result = offset; return true; } -/* - * GetMultiXactInfo - * - * Returns information about the current MultiXact state, as of: - * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId) - * members: Number of member entries (nextOffset - oldestOffset) - * oldestMultiXactId: Oldest MultiXact ID still in use - * oldestOffset: Oldest offset still in use - * - * Returns false if unable to determine, the oldest offset being unknown. 
- */ -bool -GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, - MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset) -{ - MultiXactOffset nextOffset; - MultiXactId nextMultiXactId; - bool oldestOffsetKnown; - - LWLockAcquire(MultiXactGenLock, LW_SHARED); - nextOffset = MultiXactState->nextOffset; - *oldestMultiXactId = MultiXactState->oldestMultiXactId; - nextMultiXactId = MultiXactState->nextMXact; - *oldestOffset = MultiXactState->oldestOffset; - oldestOffsetKnown = MultiXactState->oldestOffsetKnown; - LWLockRelease(MultiXactGenLock); - - if (!oldestOffsetKnown) - { - *members = 0; - *multixacts = 0; - *oldestMultiXactId = InvalidMultiXactId; - *oldestOffset = 0; - return false; - } - - *members = nextOffset - *oldestOffset; - *multixacts = nextMultiXactId - *oldestMultiXactId; - return true; -} - -/* - * Multixact members can be removed once the multixacts that refer to them - * are older than every datminmxid. autovacuum_multixact_freeze_max_age and - * vacuum_multixact_freeze_table_age work together to make sure we never have - * too many multixacts; we hope that, at least under normal circumstances, - * this will also be sufficient to keep us from using too many offsets. - * However, if the average multixact has many members, we might exhaust the - * members space while still using few enough members that these limits fail - * to trigger relminmxid advancement by VACUUM. At that point, we'd have no - * choice but to start failing multixact-creating operations with an error. - * - * To prevent that, if more than a threshold portion of the members space is - * used, we effectively reduce autovacuum_multixact_freeze_max_age and - * to a value just less than the number of multixacts in use. We hope that - * this will quickly trigger autovacuuming on the table or tables with the - * oldest relminmxid, thus allowing datminmxid values to advance and removing - * some members. 
- * - * As the fraction of the member space currently in use grows, we become - * more aggressive in clamping this value. That not only causes autovacuum - * to ramp up, but also makes any manual vacuums the user issues more - * aggressive. This happens because vacuum_get_cutoffs() will clamp the - * freeze table and the minimum freeze age cutoffs based on the effective - * autovacuum_multixact_freeze_max_age this function returns. In the worst - * case, we'll claim the freeze_max_age to zero, and every vacuum of any - * table will freeze every multixact. - */ -int -MultiXactMemberFreezeThreshold(void) -{ - MultiXactOffset members; - uint32 multixacts; - uint32 victim_multixacts; - double fraction; - int result; - MultiXactId oldestMultiXactId; - MultiXactOffset oldestOffset; - - /* If we can't determine member space utilization, assume the worst. */ - if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset)) - return 0; - - /* If member space utilization is low, no special action is required. */ - if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) - return autovacuum_multixact_freeze_max_age; - - /* - * Compute a target for relminmxid advancement. The number of multixacts - * we try to eliminate from the system is based on how far we are past - * MULTIXACT_MEMBER_SAFE_THRESHOLD. - */ - fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / - (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); - victim_multixacts = multixacts * fraction; - - /* fraction could be > 1.0, but lowest possible freeze age is zero */ - if (victim_multixacts > multixacts) - return 0; - result = multixacts - victim_multixacts; - - /* - * Clamp to autovacuum_multixact_freeze_max_age, so that we never make - * autovacuum less aggressive than it would otherwise be. 
- */ - return Min(result, autovacuum_multixact_freeze_max_age); -} - typedef struct mxtruncinfo { int64 earliestExistingPage; @@ -2986,36 +2484,12 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data /* * Delete members segments [oldest, newOldest) - * - * The members SLRU can, in contrast to the offsets one, be filled to almost - * the full range at once. This means SimpleLruTruncate() can't trivially be - * used - instead the to-be-deleted range is computed using the offsets - * SLRU. C.f. TruncateMultiXact(). */ static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) { - const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); - int64 startsegment = MXOffsetToMemberSegment(oldestOffset); - int64 endsegment = MXOffsetToMemberSegment(newOldestOffset); - int64 segment = startsegment; - - /* - * Delete all the segments but the last one. The last segment can still - * contain, possibly partially, valid data. 
- */ - while (segment != endsegment) - { - elog(DEBUG2, "truncating multixact members segment %" PRIx64, - segment); - SlruDeleteSegment(MultiXactMemberCtl, segment); - - /* move to next segment, handling wraparound correctly */ - if (segment == maxsegment) - segment = 0; - else - segment += 1; - } + SimpleLruTruncate(MultiXactMemberCtl, + MXOffsetToMemberPage(newOldestOffset)); } /* @@ -3159,7 +2633,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) elog(DEBUG1, "performing multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", oldestMulti, newOldestMulti, MultiXactIdToOffsetSegment(oldestMulti), MultiXactIdToOffsetSegment(newOldestMulti), @@ -3239,20 +2713,13 @@ MultiXactOffsetPagePrecedes(int64 page1, int64 page2) /* * Decide whether a MultiXactMember page number is "older" for truncation - * purposes. There is no "invalid offset number" so use the numbers verbatim. + * purposes. There is no "invalid offset number" and members never wrap + * around, so use the numbers verbatim. 
*/ static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2) { - MultiXactOffset offset1; - MultiXactOffset offset2; - - offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; - offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; - - return (MultiXactOffsetPrecedes(offset1, offset2) && - MultiXactOffsetPrecedes(offset1, - offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); + return page1 < page2; } /* @@ -3290,7 +2757,7 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) { - int32 diff = (int32) (offset1 - offset2); + int64 diff = (int64) (offset1 - offset2); return (diff < 0); } @@ -3387,7 +2854,7 @@ multixact_redo(XLogReaderState *record) elog(DEBUG1, "replaying multixact truncation: " "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " - "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", + "members [%" PRIu64 ", %" PRIu64 "), members segments [%" PRIx64 ", %" PRIx64 ")", xlrec.startTruncOff, xlrec.endTruncOff, MultiXactIdToOffsetSegment(xlrec.startTruncOff), MultiXactIdToOffsetSegment(xlrec.endTruncOff), diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 101b616b028b..c5117401409c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5128,7 +5128,7 @@ BootStrapXLOG(uint32 data_checksum_version) FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; checkPoint.nextMulti = FirstMultiXactId; - checkPoint.nextMultiOffset = 0; + checkPoint.nextMultiOffset = 1; checkPoint.oldestXid = FirstNormalTransactionId; checkPoint.oldestXidDB = Template1DbOid; checkPoint.oldestMulti = FirstMultiXactId; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index eddc22fc5ad2..5dd25cf2dfc0 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ 
b/src/backend/access/transam/xlogrecovery.c @@ -886,7 +886,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, U64FromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, - (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64, checkPoint.nextMulti, checkPoint.nextMultiOffset))); ereport(DEBUG1, (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index ed03e3bd50d8..259ef60bd318 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1147,7 +1147,7 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams params, * normally autovacuum_multixact_freeze_max_age, but may be less if we are * short of multixact member space. */ - effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age; /* * Almost ready to set freeze output parameters; check if OldestXmin or diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index ed19c74bb19f..34909ee54ffd 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -1151,7 +1151,7 @@ do_start_worker(void) /* Also determine the oldest datminmxid we will consider. */ recentMulti = ReadNextMultiXactId(); - multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold(); + multiForceLimit = recentMulti - autovacuum_multixact_freeze_max_age; if (multiForceLimit < FirstMultiXactId) multiForceLimit -= FirstMultiXactId; @@ -1939,7 +1939,7 @@ do_autovacuum(void) * normally autovacuum_multixact_freeze_max_age, but may be less if we are * short of multixact member space. 
*/ - effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age; /* * Find the pg_database entry and select the default freeze ages. We use diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 10de058ce91f..5295108ade3e 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -264,7 +264,7 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile->checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), + printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"), ControlFile->checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile->checkPointCopy.oldestXid); diff --git a/src/bin/pg_resetwal/meson.build b/src/bin/pg_resetwal/meson.build index 290832b22996..1e2dfb38a5b7 100644 --- a/src/bin/pg_resetwal/meson.build +++ b/src/bin/pg_resetwal/meson.build @@ -25,6 +25,7 @@ tests += { 'tests': [ 't/001_basic.pl', 't/002_corrupted.pl', + 't/003_mxoff.pl', ], }, } diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index a89d72fc5cfe..4e5eeced89d5 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -267,7 +267,7 @@ main(int argc, char *argv[]) case 'O': errno = 0; - set_mxoff = strtoul(optarg, &endptr, 0); + set_mxoff = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-O"); @@ -743,7 +743,7 @@ PrintControlValues(bool guessed) ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile.checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), + printf(_("Latest checkpoint's NextMultiOffset: %" PRIu64 "\n"), 
ControlFile.checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile.checkPointCopy.oldestXid); @@ -817,7 +817,7 @@ PrintNewControlValues(void) if (set_mxoff != -1) { - printf(_("NextMultiOffset: %u\n"), + printf(_("NextMultiOffset: %" PRIu64 "\n"), ControlFile.checkPointCopy.nextMultiOffset); } diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl index d6bbbd0ceda3..cc89e0764aea 100644 --- a/src/bin/pg_resetwal/t/001_basic.pl +++ b/src/bin/pg_resetwal/t/001_basic.pl @@ -213,7 +213,7 @@ sub get_slru_files sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1])); @files = get_slru_files('pg_multixact/offsets'); -$mult = 32 * $blcksz / 4; +$mult = 32 * $blcksz / 8; # --multixact-ids argument is "new,old" push @cmd, '--multixact-ids' => sprintf("%d,%d", diff --git a/src/bin/pg_resetwal/t/003_mxoff.pl b/src/bin/pg_resetwal/t/003_mxoff.pl new file mode 100644 index 000000000000..3c1b7fa1d335 --- /dev/null +++ b/src/bin/pg_resetwal/t/003_mxoff.pl @@ -0,0 +1,170 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use Math::BigInt; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +sub mxact_eater +{ + my $node = shift; + my $tbl = shift; + + $node->start; + $node->safe_psql('postgres', + "CREATE TABLE ${tbl} (I INT PRIMARY KEY, N_UPDATED INT) " . + " WITH (AUTOVACUUM_ENABLED=FALSE);" . 
+ "INSERT INTO ${tbl} SELECT G, 0 FROM GENERATE_SERIES(1, 50) G;"); + + # consume around 10k multixact-offsets + my $nclients = 10; + my $update_every = 75; + my @connections = (); + + for (0..$nclients) + { + my $conn = $node->background_psql('postgres'); + $conn->query_safe("BEGIN"); + + push(@connections, $conn); + } + + for (my $i = 0; $i < 1000; $i++) + { + my $conn = $connections[$i % $nclients]; + + $conn->query_safe("COMMIT;"); + $conn->query_safe("BEGIN"); + + if ($i % $update_every == 0) + { + $conn->query_safe( + "UPDATE ${tbl} SET " . + "N_UPDATED = N_UPDATED + 1 " . + "WHERE I = ${i} % 50"); + } + else + { + $conn->query_safe( + "SELECT * FROM ${tbl} FOR KEY SHARE"); + } + } + + for my $conn (@connections) + { + $conn->quit(); + } + + $node->stop; +} + +sub next_mxoff +{ + my $node = shift; + my ($stdout, $stderr) = + run_command([ 'pg_controldata', $node->data_dir ]); + my @control_data = split("\n", $stdout); + my $next_mxoff = undef; + + foreach (@control_data) + { + if ($_ =~ /^Latest checkpoint's NextMultiOffset:\s*(.*)$/mg) + { + $next_mxoff = $1; + last; + } + } + die "NextMultiOffset not found in control file\n" + unless defined($next_mxoff); + + return $next_mxoff; +} + +sub reset_mxoff +{ + my $node = shift; + my $offset = shift; + $offset = Math::BigInt->new($offset); + + # Get block size + my $out = (run_command([ 'pg_resetwal', '--dry-run', $node->data_dir ]))[0]; + $out =~ /^Database block size: *(\d+)$/m or die; + my $blcksz = $1; + + # Reset to new offset + my @cmd = ('pg_resetwal', '--pgdata' => $node->data_dir); + push @cmd, '--multixact-offset' => $offset->as_hex(); + command_ok(\@cmd, 'set oldest multixact-offset'); + + # Fill empty pg_multixact/members segment + my $mult = 32 * int($blcksz / 20) * 4; + my $segname = sprintf "%015X", $offset / $mult; + + my @dd = ('dd'); + push @dd, "if=/dev/zero"; + push @dd, "of=" . $node->data_dir . "/pg_multixact/members/" . 
$segname; + push @dd, "bs=$blcksz"; + push @dd, "count=32"; + command_ok(\@dd, 'fill empty multixact-members'); +} + +my ($off1, $off2); + +# start from defaults +my $node1 = PostgreSQL::Test::Cluster->new('node1'); +$node1->init; +$off1 = next_mxoff($node1); +mxact_eater($node1, "FOO"); +$off2 = next_mxoff($node1); +note "> start from $off1, finished at $off2\n"; + +# start from before 32-bit wraparound +my $node2 = PostgreSQL::Test::Cluster->new('node2'); +$node2->init; +reset_mxoff($node2, 0xFFFF0000); +$off1 = next_mxoff($node2); +mxact_eater($node2, "FOO"); +$off2 = next_mxoff($node2); +note "> start from $off1, finished at $off2\n"; + +# start near 32-bit wraparound +my $node3 = PostgreSQL::Test::Cluster->new('node3'); +$node3->init; +reset_mxoff($node3, 0xFFFFEC77); +$off1 = next_mxoff($node3); +mxact_eater($node3, "FOO"); +$off2 = next_mxoff($node3); +note "> start from $off1, finished at $off2\n"; + +# start over 32-bit wraparound +my $node4 = PostgreSQL::Test::Cluster->new('node4'); +$node4->init; +reset_mxoff($node4, '0xFFFFFFFF0000'); +$off1 = next_mxoff($node4); +mxact_eater($node4, "FOO"); +$off2 = next_mxoff($node3); +note "> start from $off1, finished at $off2\n"; + +# check invariant +$node1->start; +$node2->start; +$node3->start; +$node4->start; + +my $var1 = $node1->safe_psql('postgres', 'TABLE FOO'); +my $var2 = $node2->safe_psql('postgres', 'TABLE FOO'); +my $var3 = $node3->safe_psql('postgres', 'TABLE FOO'); +my $var4 = $node4->safe_psql('postgres', 'TABLE FOO'); +ok($var1 eq $var2 eq $var3 eq $var4, + 'check table invariant in all nodes'); + +$node4->stop; +$node3->stop; +$node2->stop; +$node1->stop; + +done_testing(); diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index 69fcf593caec..42995d53b0bc 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -18,11 +18,14 @@ OBJS = \ file.o \ function.o \ info.o \ + multixact_new.o \ + multixact_old.o \ option.o \ parallel.o \ pg_upgrade.o \ 
relfilenumber.o \ server.o \ + slru_io.o \ tablespace.o \ task.o \ util.o \ diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index ac992f0d14b1..ca87ae221ce0 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -8,11 +8,14 @@ pg_upgrade_sources = files( 'file.c', 'function.c', 'info.c', + 'multixact_new.c', + 'multixact_old.c', 'option.c', 'parallel.c', 'pg_upgrade.c', 'relfilenumber.c', 'server.c', + 'slru_io.c', 'tablespace.c', 'task.c', 'util.c', @@ -47,6 +50,7 @@ tests += { 't/004_subscription.pl', 't/005_char_signedness.pl', 't/006_transfer_modes.pl', + 't/007_multi_wrap.pl', ], 'test_kwargs': {'priority': 40}, # pg_upgrade tests are slow }, diff --git a/src/bin/pg_upgrade/multixact_new.c b/src/bin/pg_upgrade/multixact_new.c new file mode 100644 index 000000000000..f565a378254b --- /dev/null +++ b/src/bin/pg_upgrade/multixact_new.c @@ -0,0 +1,103 @@ +/* + * multixact_new.c + * + * Functions to write multixacts in the v19 format with 64-bit + * MultiXactOffsets + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_new.c + */ + +#include "postgres_fe.h" + +#include "access/multixact.h" +#include "access/multixact_internal.h" + +#include "multixact_new.h" + +MultiXactWriter * +AllocMultiXactWrite(const char *pgdata, MultiXactId firstMulti, + MultiXactOffset firstOffset) +{ + MultiXactWriter *state = pg_malloc(sizeof(*state)); + char dir[MAXPGPATH] = {0}; + + pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata); + state->offset = AllocSlruWrite(dir, false); + SlruWriteSwitchPage(state->offset, MultiXactIdToOffsetPage(firstMulti)); + + pg_sprintf(dir, "%s/pg_multixact/members", pgdata); + state->members = AllocSlruWrite(dir, true /* use long segment names */ ); + SlruWriteSwitchPage(state->members, MXOffsetToMemberPage(firstOffset)); + + return state; +} + +/* + * Write a new multixact with members. 
+ * + * Simplified version of the corresponding server function, hence the name. + */ +void +RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset, + MultiXactId multi, int nmembers, MultiXactMember *members) +{ + int64 pageno; + int64 prev_pageno; + int entryno; + char *buf; + MultiXactOffset *offptr; + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + /* Store the offset */ + buf = SlruWriteSwitchPage(state->offset, pageno); + offptr = (MultiXactOffset *) buf; + offptr[entryno] = offset; + + /* Store the members */ + prev_pageno = -1; + for (int i = 0; i < nmembers; i++, offset++) + { + TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + + if (pageno != prev_pageno) + { + buf = SlruWriteSwitchPage(state->members, pageno); + prev_pageno = pageno; + } + + memberptr = (TransactionId *) (buf + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) (buf + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; + } +} + +void +FreeMultiXactWrite(MultiXactWriter *state) +{ + FreeSlruWrite(state->offset); + FreeSlruWrite(state->members); + + pfree(state); +} diff --git a/src/bin/pg_upgrade/multixact_new.h b/src/bin/pg_upgrade/multixact_new.h new file mode 100644 index 000000000000..f66e6af7e45e --- /dev/null +++ b/src/bin/pg_upgrade/multixact_new.h @@ -0,0 +1,23 @@ +/* + * multixact_new.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_new.h + */ +#include "access/multixact.h" + +#include "slru_io.h" + +typedef struct MultiXactWriter +{ + 
+ SlruSegState *offset; + SlruSegState *members; +} MultiXactWriter; + +extern MultiXactWriter *AllocMultiXactWrite(const char *pgdata, + MultiXactId firstMulti, + MultiXactOffset firstOffset); +extern void RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset, + MultiXactId multi, int nmembers, + MultiXactMember *members); +extern void FreeMultiXactWrite(MultiXactWriter *writer); diff --git a/src/bin/pg_upgrade/multixact_old.c b/src/bin/pg_upgrade/multixact_old.c new file mode 100644 index 000000000000..70ae88d97f46 --- /dev/null +++ b/src/bin/pg_upgrade/multixact_old.c @@ -0,0 +1,297 @@ +/* + * multixact_old.c + * + * Functions to read pre-v19 multixacts + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_old.c + */ + +#include "postgres_fe.h" + +#include "multixact_old.h" +#include "pg_upgrade.h" + +/* + * NOTE: below are a bunch of definitions and simple static inline functions + * that are copy-pasted from multixact.c from version 18. The only difference + * is that we use the OldMultiXactOffset type equal to uint32 instead of + * MultiXactOffset which became uint64. + */ + +/* We need four bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(OldMultiXactOffset)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ + return multi % MULTIXACT_OFFSETS_PER_PAGE; +} + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. 
This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(OldMultiXactOffset offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(MultiXactOffset offset) +{ + OldMultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(OldMultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +static inline int +MXOffsetToFlagsBitShift(OldMultiXactOffset offset) +{ + int 
member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; +} + +/* + * Construct reader of old multixacts. + * + * Returns the malloced memory used by the all other calls in this module. + */ +OldMultiXactReader * +AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti, + OldMultiXactOffset nextOffset) +{ + OldMultiXactReader *state = state = pg_malloc(sizeof(*state)); + char dir[MAXPGPATH] = {0}; + + state->nextMXact = nextMulti; + state->nextOffset = nextOffset; + + pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata); + state->offset = AllocSlruRead(dir); + + pg_sprintf(dir, "%s/pg_multixact/members", pgdata); + state->members = AllocSlruRead(dir); + + return state; +} + +/* + * This is a simplified version of the GetMultiXactIdMembers() server function. + * + * - Only return the updating member, if any. Upgrade only cares about the + * updaters. If there is no updating member, return the first locking-only + * member. We don't have any way to represent "no members", but we also don't + * need to preserve all the locking members. + * + * - We don't need to worry about locking and some corner cases because there's + * no concurrent activity. + */ +void +GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi, + TransactionId *result, MultiXactStatus *status) +{ + MultiXactId nextMXact, + nextOffset, + tmpMXact; + int64 pageno, + prev_pageno; + int entryno, + length; + char *buf; + OldMultiXactOffset *offptr, + offset; + TransactionId result_xid = InvalidTransactionId; + bool result_isupdate = false; + + nextMXact = state->nextMXact; + nextOffset = state->nextOffset; + + /* + * See GetMultiXactIdMembers in multixact.c + * + * Find out the offset at which we need to start reading MultiXactMembers + * and the number of members in the multixact. We determine the latter as + * the difference between this multixact's starting offset and the next + * one's. 
However, there are some corner cases to worry about: + * + * 1. This multixact may be the latest one created, in which case there is + * no next one to look at. In this case the nextOffset value we just + * saved is the correct endpoint. + * + * 2. The next multixact may still be in process of being filled in... + * This cannot happen during upgrade. + * + * 3. Because GetNewMultiXactId increments offset zero to offset one to + * handle case #2, there is an ambiguity near the point of offset + * wraparound. If we see next multixact's offset is one, is that our + * multixact's actual endpoint, or did it end at zero with a subsequent + * increment? We handle this using the knowledge that if the zero'th + * member slot wasn't filled, it'll contain zero, and zero isn't a valid + * transaction ID so it can't be a multixact member. Therefore, if we + * read a zero from the members array, just ignore it. + */ + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + buf = SlruReadSwitchPage(state->offset, pageno); + offptr = (OldMultiXactOffset *) buf; + offptr += entryno; + offset = *offptr; + + Assert(offset != 0); + + /* + * Use the same increment rule as GetNewMultiXactId(), that is, don't + * handle wraparound explicitly until needed. + */ + tmpMXact = multi + 1; + + if (nextMXact == tmpMXact) + { + /* Corner case 1: there is no next multixact */ + length = nextOffset - offset; + } + else + { + OldMultiXactOffset nextMXOffset; + + /* handle wraparound if needed */ + if (tmpMXact < FirstMultiXactId) + tmpMXact = FirstMultiXactId; + + prev_pageno = pageno; + + pageno = MultiXactIdToOffsetPage(tmpMXact); + entryno = MultiXactIdToOffsetEntry(tmpMXact); + + if (pageno != prev_pageno) + buf = SlruReadSwitchPage(state->offset, pageno); + + offptr = (OldMultiXactOffset *) buf; + offptr += entryno; + nextMXOffset = *offptr; + + /* + * Corner case 2: next multixact is still being filled in, this must + * not happen during upgrade. 
+ */ + Assert(nextMXOffset != 0); + + length = nextMXOffset - offset; + } + + prev_pageno = -1; + for (int i = 0; i < length; i++, offset++) + { + TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; + MultiXactStatus st; + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + + if (pageno != prev_pageno) + { + buf = SlruReadSwitchPage(state->members, pageno); + prev_pageno = pageno; + } + + xactptr = (TransactionId *) (buf + memberoff); + if (!TransactionIdIsValid(*xactptr)) + { + /* Corner case 3: we must be looking at unused slot zero */ + Assert(offset == 0); + continue; + } + + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (buf + flagsoff); + + st = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + + /* Verify that there is a single update Xid among the given members. */ + if (ISUPDATE_from_mxstatus(st)) + { + if (result_isupdate) + pg_fatal("multixact %u has more than one updating member", + multi); + result_xid = *xactptr; + result_isupdate = true; + } + else if (!TransactionIdIsValid(result_xid)) + result_xid = *xactptr; + } + + /* A multixid with zero members should not happen */ + Assert(TransactionIdIsValid(result_xid)); + + *result = result_xid; + *status = result_isupdate ? MultiXactStatusUpdate : + MultiXactStatusForKeyShare; +} + +/* + * Frees the malloced reader. 
+ */ +void +FreeOldMultiXactReader(OldMultiXactReader *state) +{ + FreeSlruRead(state->offset); + FreeSlruRead(state->members); + + pfree(state); +} diff --git a/src/bin/pg_upgrade/multixact_old.h b/src/bin/pg_upgrade/multixact_old.h new file mode 100644 index 000000000000..8eb5af2ccafd --- /dev/null +++ b/src/bin/pg_upgrade/multixact_old.h @@ -0,0 +1,29 @@ +/* + * multixact_old.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/multixact_old.h + */ + +#include "access/multixact.h" +#include "slru_io.h" + +typedef uint32 OldMultiXactOffset; + +typedef struct OldMultiXactReader +{ + MultiXactId nextMXact; + OldMultiXactOffset nextOffset; + + SlruSegState *offset; + SlruSegState *members; +} OldMultiXactReader; + +extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata, + MultiXactId nextMulti, + OldMultiXactOffset nextOffset); +extern void GetOldMultiXactIdSingleMember(OldMultiXactReader *state, + MultiXactId multi, + TransactionId *result, + MultiXactStatus *status); +extern void FreeOldMultiXactReader(OldMultiXactReader *reader); diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 490e98fa26f2..eb87052c4adb 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -48,6 +48,8 @@ #include "common/logging.h" #include "common/restricted_token.h" #include "fe_utils/string_utils.h" +#include "multixact_old.h" +#include "multixact_new.h" #include "pg_upgrade.h" /* @@ -769,6 +771,81 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir) check_ok(); } +/* + * Convert pg_multixact/offset and /members to new format with 64-bit offsets. 
+ */ +static void +convert_multixacts(MultiXactId *new_nxtmulti, MultiXactOffset *new_nxtmxoff) +{ + MultiXactId oldest_multi, + next_multi; + OldMultiXactReader *old_reader; + MultiXactWriter *new_writer; + MultiXactOffset next_offset; + + /* + * The range of valid multi XIDs is unchanged by the conversion (they are + * referenced from the heap tables), but the members SLRU is rewritten to + * start from offset 1. + */ + oldest_multi = old_cluster.controldata.chkpnt_oldstMulti; + next_multi = old_cluster.controldata.chkpnt_nxtmulti; + next_offset = 1; + + old_reader = AllocOldMultiXactRead(old_cluster.pgdata, + old_cluster.controldata.chkpnt_nxtmulti, + old_cluster.controldata.chkpnt_nxtmxoff); + new_writer = AllocMultiXactWrite(new_cluster.pgdata, + oldest_multi, next_offset); + + /* handle wraparound */ + if (next_multi < FirstMultiXactId) + next_multi = FirstMultiXactId; + + /* + * Read multixids from old files one by one, and write them back in the + * new format. + */ + for (MultiXactId multi = oldest_multi; multi != next_multi;) + { + TransactionId xid; + MultiXactStatus status; + MultiXactMember member; + + /* + * Read the old multixid. The locking-only XIDs that may be part of + * multi-xids don't matter after upgrade, as there can be no + * transactions running across upgrade. So as a little optimization, + * we only read one member from each multixid: the one updating one, + * or if there was no update, arbitrarily the first locking xid. + */ + GetOldMultiXactIdSingleMember(old_reader, multi, &xid, &status); + + /* Write it out in new format */ + member.xid = xid; + member.status = status; + RecordNewMultiXact(new_writer, next_offset, multi, 1, &member); + + next_offset += 1; + multi++; + /* handle wraparound */ + if (multi < FirstMultiXactId) + multi = FirstMultiXactId; + } + + /* + * Update the nextMXact/Offset values in the control file to match what we + * wrote. The nextMXact is unchanged, but nextOffset will be different. 
+ */ + Assert(next_multi == old_cluster.controldata.chkpnt_nxtmulti); + *new_nxtmulti = next_multi; + *new_nxtmxoff = next_offset; + + /* Release resources */ + FreeMultiXactWrite(new_writer); + FreeOldMultiXactReader(old_reader); +} + static void copy_xact_xlog_xid(void) { @@ -816,8 +893,34 @@ copy_xact_xlog_xid(void) if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) { - copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); - copy_subdir_files("pg_multixact/members", "pg_multixact/members"); + MultiXactId new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti; + MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff; + + /* + * If the old server is before the + * MULTIXACTOFFSET_FORMATCHANGE_CAT_VER it must have 32-bit multixid + * offsets, thus it should be converted. + */ + if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER) + { + remove_new_subdir("pg_multixact/members", false); + remove_new_subdir("pg_multixact/offsets", false); + + prep_status("Converting pg_multixact/offsets to 64-bit"); + /* convert_multixacts handles new_nxtmulti wraparound */ + convert_multixacts(&new_nxtmulti, &new_nxtmxoff); + check_ok(); + } + else + { + /* handle wraparound */ + if (new_nxtmulti < FirstMultiXactId) + new_nxtmulti = FirstMultiXactId; + + copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); + copy_subdir_files("pg_multixact/members", "pg_multixact/members"); + } prep_status("Setting next multixact ID and offset for new cluster"); @@ -826,10 +929,8 @@ copy_xact_xlog_xid(void) * counters here and the oldest multi present on system. 
*/ exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", - new_cluster.bindir, - old_cluster.controldata.chkpnt_nxtmxoff, - old_cluster.controldata.chkpnt_nxtmulti, + "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"", + new_cluster.bindir, new_nxtmxoff, new_nxtmulti, old_cluster.controldata.chkpnt_oldstMulti, new_cluster.pgdata); check_ok(); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index e86336f4be95..127b2cb00fab 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -114,6 +114,11 @@ extern char *output_files[]; */ #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 +/* + * Switch from 32-bit to 64-bit for multixid offsets. + */ +#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 999999999 + /* * large object chunk size added to pg_controldata, * commit 5f93c37805e7485488480916b4585e098d3cc883 diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c new file mode 100644 index 000000000000..2a0624ea8b8e --- /dev/null +++ b/src/bin/pg_upgrade/slru_io.c @@ -0,0 +1,239 @@ +/* + * slru_io.c + * + * Routines for reading and writing SLRU files during upgrade. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/slru_io.c + */ + +#include "postgres_fe.h" + +#include + +#include "common/fe_memutils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "port/pg_iovec.h" +#include "pg_upgrade.h" +#include "slru_io.h" + +/* + * State for reading or writing an SLRU, with a one page buffer. 
+ */
+typedef struct SlruSegState
+{
+	bool		writing;		/* true for a writer, false for a reader */
+	bool		long_segment_names; /* writer only: use the 15-digit segment
+									 * file name format instead of 4 digits */
+
+	char	   *dir;			/* directory holding the segment files */
+	char	   *fn;				/* name of the open segment file, or NULL */
+	int			fd;				/* descriptor of the open segment, or -1 */
+	int64		segno;			/* open segment number, or -1 if none */
+	uint64		pageno;			/* page currently held in buf */
+
+	PGAlignedBlock buf;			/* one-page buffer */
+} SlruSegState;
+
+/*
+ * Allocate a state object for 'dir' with no segment open.
+ *
+ * Every field except the page buffer is given a defined value here, so that
+ * neither readers nor writers ever look at uninitialized flags.  The
+ * reader/writer constructors override 'writing' and 'long_segment_names' as
+ * needed.
+ */
+static inline SlruSegState *
+AllocSlruSegState(char *dir)
+{
+	SlruSegState *state = pg_malloc(sizeof(*state));
+
+	state->writing = false;
+	state->long_segment_names = false;
+	state->dir = pstrdup(dir);
+	state->fn = NULL;
+	state->fd = -1;
+	state->segno = -1;
+	state->pageno = 0;
+
+	return state;
+}
+
+/*
+ * Write the buffered page to its slot in the currently open segment.
+ *
+ * Does nothing if no segment has been opened yet.  Exits via pg_fatal() if
+ * the write fails.
+ */
+static inline void
+SlruFlush(SlruSegState *state)
+{
+	struct iovec iovec = {
+		.iov_base = &state->buf,
+		.iov_len = BLCKSZ,
+	};
+	off_t		offset;
+
+	if (state->segno == -1)
+		return;
+
+	/* byte position of the buffered page within its segment file */
+	offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	if (pg_pwritev_with_retry(state->fd, &iovec, 1, offset) < 0)
+		pg_fatal("could not write file \"%s\": %m", state->fn);
+}
+
+/*
+ * Create an SLRU reader for dir.
+ *
+ * Returns the malloced state used by all the other read calls in this
+ * module.
+ */
+SlruSegState *
+AllocSlruRead(char *dir)
+{
+	SlruSegState *state = AllocSlruSegState(dir);
+
+	state->writing = false;
+
+	return state;
+}
+
+/*
+ * Open given page for reading.
+ *
+ * Reading can be done in random order.
+ */
+char *
+SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	int64		segno;
+	off_t		offset;
+	ssize_t		nread;
+	struct iovec iovec = {
+		.iov_base = &state->buf,
+		.iov_len = BLCKSZ,
+	};
+
+	Assert(!state->writing);	/* read only mode */
+
+	/* Nothing to do if the requested page is already in the buffer. */
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	if (segno != state->segno)
+	{
+		/* Release the previously open segment, if any. */
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+
+			pg_free(state->fn);
+			state->fn = NULL;
+		}
+
+		/* Open new segment */
+		state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno);
+		if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", state->fn);
+	}
+
+	state->segno = segno;
+
+	offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	nread = pg_preadv(state->fd, &iovec, 1, offset);
+	if (nread < 0)
+		pg_fatal("could not read file \"%s\": %m", state->fn);
+
+	/*
+	 * A short read (e.g. the segment file ends before this page) would
+	 * otherwise leave bytes of the previously buffered page behind.  Present
+	 * the missing part as zeros instead.
+	 */
+	if (nread < BLCKSZ)
+		memset(state->buf.data + nread, 0, BLCKSZ - nread);
+
+	state->pageno = pageno;
+
+	return state->buf.data;
+}
+
+/*
+ * Frees the malloced reader, including the directory and file name strings
+ * it owns, and closes the segment file if one is open.
+ */
+void
+FreeSlruRead(SlruSegState *state)
+{
+	Assert(!state->writing);	/* read only mode */
+
+	/* fd stays -1 if no page was ever read */
+	if (state->fd >= 0)
+		close(state->fd);
+
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
+
+/*
+ * Open the given page for writing.
+ *
+ * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that
+ * each segment is written in full before moving on to next one. This
+ * limitation would be easy to lift if needed, but it fits the usage pattern of
+ * current callers.
+ */
+char *
+SlruWriteSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	int64		segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	off_t		offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	Assert(state->writing);		/* write only mode */
+
+	/* Nothing to do if the requested page is already the buffered one. */
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	/* Write out the page we are leaving and start the new one as zeros. */
+	SlruFlush(state);
+	memset(state->buf.data, 0, BLCKSZ);
+
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			/* close() may be the only place a write failure is reported */
+			if (close(state->fd) != 0)
+				pg_fatal("could not close file \"%s\": %m", state->fn);
+			state->fd = -1;
+
+			pg_free(state->fn);
+			state->fn = NULL;
+		}
+
+		/* Create the segment */
+		if (state->long_segment_names)
+		{
+			Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+			state->fn = psprintf("%s/%015" PRIX64, state->dir, segno);
+		}
+		else
+		{
+			Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
+			state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno);
+		}
+
+		if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							  pg_file_create_mode)) < 0)
+		{
+			pg_fatal("could not create file \"%s\": %m", state->fn);
+		}
+
+		state->segno = segno;
+
+		/* Zero-fill the pages preceding the first one written here. */
+		if (offset > 0 && pg_pwrite_zeros(state->fd, offset, 0) < 0)
+			pg_fatal("could not write file \"%s\": %m", state->fn);
+	}
+
+	state->pageno = pageno;
+
+	return state->buf.data;
+}
+
+/*
+ * Create an SLRU writer for dir.
+ *
+ * long_segment_names selects the 15-digit segment file name format instead
+ * of the 4-digit one.
+ *
+ * Returns the malloced state used by all the other write calls in this
+ * module.
+ */
+SlruSegState *
+AllocSlruWrite(char *dir, bool long_segment_names)
+{
+	SlruSegState *state = AllocSlruSegState(dir);
+
+	state->writing = true;
+	state->long_segment_names = long_segment_names;
+
+	return state;
+}
+
+/*
+ * Frees the malloced writer.
+ */
+void
+FreeSlruWrite(SlruSegState *state)
+{
+	Assert(state->writing);
+
+	/* Write out the last buffered page; no-op if nothing was written. */
+	SlruFlush(state);
+
+	/*
+	 * fd stays -1 if no page was ever written.  Otherwise check close(),
+	 * since it may be the only place a write failure is reported.
+	 */
+	if (state->fd >= 0 && close(state->fd) != 0)
+		pg_fatal("could not close file \"%s\": %m", state->fn);
+
+	pg_free(state->fn);
+	pg_free(state->dir);
+	pg_free(state);
+}
diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h new file mode 100644 index 000000000000..295fd0bebc43 --- /dev/null +++ b/src/bin/pg_upgrade/slru_io.h @@ -0,0 +1,23 @@ +/* + * slru_io.h + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * src/bin/pg_upgrade/slru_io.h + */ + +/* + * See access/slru.h + * + * Copy here, since slru.h could not be included in fe code. + */ +#define SLRU_PAGES_PER_SEGMENT 32 + +typedef struct SlruSegState SlruSegState; + +extern SlruSegState *AllocSlruRead(char *dir); +extern char *SlruReadSwitchPage(SlruSegState *state, uint64 pageno); +extern void FreeSlruRead(SlruSegState *state); + +extern SlruSegState *AllocSlruWrite(char *dir, bool long_segment_names); +extern char *SlruWriteSwitchPage(SlruSegState *state, uint64 pageno); +extern void FreeSlruWrite(SlruSegState *state); diff --git a/src/bin/pg_upgrade/t/007_multi_wrap.pl b/src/bin/pg_upgrade/t/007_multi_wrap.pl new file mode 100644 index 000000000000..0ad8fd599068 --- /dev/null +++ b/src/bin/pg_upgrade/t/007_multi_wrap.pl @@ -0,0 +1,176 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use Math::BigInt; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use PostgreSQL::Test::AdjustDump; +use PostgreSQL::Test::AdjustUpgrade; +use Test::More; + +# Temp dir for a dumps. +my $tempdir = PostgreSQL::Test::Utils::tempdir; + +# Can be changed to test the other modes.
+my $mode = $ENV{PG_TEST_PG_UPGRADE_MODE} || '--copy'; + +# Handy pg_resetwal wrapper +sub reset_mxoff +{ + my %args = @_; + + my $node = $args{node}; + my $offset = $args{offset}; + my $multi = $args{multi}; + my $blcksz = sub # Get block size + { + my $out = (run_command([ 'pg_resetwal', '--dry-run', + $node->data_dir ]))[0]; + $out =~ /^Database block size: *(\d+)$/m or die; + return $1; + }->(); + + my @cmd; + + # Reset cluster + @cmd = ('pg_resetwal', '--pgdata' => $node->data_dir); + if (defined($offset)) + { + push @cmd, '--multixact-offset' => $offset; + } + if (defined($multi)) + { + push @cmd, "--multixact-ids=$multi,$multi"; + } + command_ok(\@cmd, 'reset multi/offset'); + + my $n_items; + my $segname; + + # Fill empty pg_multixact segments + if (defined($offset)) + { + $n_items = 32 * int($blcksz / 20) * 4; + $segname = sprintf "%015X", ($offset / $n_items); + $segname = $node->data_dir . "/pg_multixact/members/" . $segname; + + @cmd = ('dd'); + push @cmd, "if=/dev/zero"; + push @cmd, "of=" . $segname; + push @cmd, "bs=$blcksz"; + push @cmd, "count=32"; + command_ok(\@cmd, 'fill empty multixact-members'); + } + + if (defined($multi)) + { + $n_items = 32 * int($blcksz / 8); + $segname = sprintf "%04X", $multi / $n_items; + $segname = $node->data_dir . "/pg_multixact/offsets/" . $segname; + + @cmd = ('dd'); + push @cmd, "if=/dev/zero"; + push @cmd, "of=" . $segname; + push @cmd, "bs=$blcksz"; + push @cmd, "count=32"; + command_ok(\@cmd, 'fill empty multixact-offsets'); + } +} + +sub get_dump_for_comparison +{ + my ($node, $db, $file_prefix, $adjust_child_columns) = @_; + + my $dumpfile = $tempdir . '/' . $file_prefix . 
'.sql'; + my $dump_adjusted = "${dumpfile}_adjusted"; + + open(my $dh, '>', $dump_adjusted) + || die "could not open $dump_adjusted for writing $!"; + + $node->run_log( + [ + 'pg_dump', '--no-sync', + '--restrict-key' => 'test', + '-d' => $node->connstr($db), + '-f' => $dumpfile + ]); + + print $dh adjust_regress_dumpfile(slurp_file($dumpfile), + $adjust_child_columns); + close($dh); + + return $dump_adjusted; +} + +# Create old node +my $old = PostgreSQL::Test::Cluster->new("old"); +$old->init; +reset_mxoff(node => $old, multi => 4294967295, offset => 429496729); + +$old->start; +$old->safe_psql('postgres', +qq( + CREATE TABLE test_table (id integer NOT NULL PRIMARY KEY, val text); + INSERT INTO test_table VALUES (1, 'a'); +)); + +my $conn1 = $old->background_psql('postgres'); +my $conn2 = $old->background_psql('postgres'); + +$conn1->query_safe(qq( + BEGIN; + SELECT * FROM test_table WHERE id = 1 FOR SHARE; +)); +$conn2->query_safe(qq( + BEGIN; + SELECT * FROM test_table WHERE id = 1 FOR SHARE; +)); + +$conn1->query_safe(qq(COMMIT;)); +$conn2->query_safe(qq(COMMIT;)); + +$conn1->quit; +$conn2->quit; + +$old->stop; + +# Create new node +my $new = PostgreSQL::Test::Cluster->new("new"); +$new->init; + +# Run pg_upgrade +command_ok( + [ + 'pg_upgrade', '--no-sync', + '--old-datadir' => $old->data_dir, + '--new-datadir' => $new->data_dir, + '--old-bindir' => $old->config_data('--bindir'), + '--new-bindir' => $new->config_data('--bindir'), + '--socketdir' => $new->host, + '--old-port' => $old->port, + '--new-port' => $new->port, + $mode, + ], + 'run of pg_upgrade for new instance'); +ok( !-d $new->data_dir . 
"/pg_upgrade_output.d", + "pg_upgrade_output.d/ removed after pg_upgrade success"); + +$old->start; +my $src_dump = + get_dump_for_comparison($old, 'postgres', + "oldnode_1_dump", 0); +$old->stop; + +$new->start; +my $dst_dump = + get_dump_for_comparison($new, 'postgres', + "newnode_1_dump", 0); +$new->stop; + +compare_files($src_dump, $dst_dump, + 'dump outputs from original and restored regression databases match'); + +done_testing(); diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 82e4bb90dd58..d688b547c547 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -28,8 +28,6 @@ #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) -#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) - /* * Possible multixact lock modes ("status"). The first four modes are for * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the @@ -111,9 +109,6 @@ extern bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly); extern void MultiXactIdSetOldestMember(void); extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly); -extern bool GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members, - MultiXactId *oldestMultiXactId, - MultiXactOffset *oldestOffset); extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2); @@ -147,7 +142,6 @@ extern void MultiXactSetNextMXact(MultiXactId nextMulti, extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset); extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); -extern int MultiXactMemberFreezeThreshold(void); extern void multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); diff --git a/src/include/access/multixact_internal.h b/src/include/access/multixact_internal.h new file mode 
100644 index 000000000000..73fb3e998fb7 --- /dev/null +++ b/src/include/access/multixact_internal.h @@ -0,0 +1,109 @@ +/* + * multixact_internal.h + * + * Defines and helper functions for the PostgreSQL multi-transaction-log manager + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * src/include/access/multixact_internal.h + */ +#ifndef MULTIXACT_INTERNAL_H +#define MULTIXACT_INTERNAL_H + +#include "postgres.h" + +#include "access/multixact.h" + +/* + * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is + * used everywhere else in Postgres. + */ + +/* We need 8 bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +static inline int64 +MultiXactIdToOffsetPage(MultiXactId multi) +{ + return multi / MULTIXACT_OFFSETS_PER_PAGE; +} + +static inline int +MultiXactIdToOffsetEntry(MultiXactId multi) +{ + return multi % MULTIXACT_OFFSETS_PER_PAGE; +} + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? 
*/ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* page in which a member is to be found */ +static inline int64 +MXOffsetToMemberPage(MultiXactOffset offset) +{ + return offset / MULTIXACT_MEMBERS_PER_PAGE; +} + +static inline int64 +MXOffsetToMemberSegment(MultiXactOffset offset) +{ + return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; +} + +/* Location (byte offset within page) of flag word for a given member */ +static inline int +MXOffsetToFlagsOffset(MultiXactOffset offset) +{ + MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; + int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; + + return byteoff; +} + +static inline int +MXOffsetToFlagsBitShift(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; + + return bshift; +} + +/* Location (byte offset within page) of TransactionId of given member */ +static inline int +MXOffsetToMemberOffset(MultiXactOffset offset) +{ + int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; + + return MXOffsetToFlagsOffset(offset) + + MULTIXACT_FLAGBYTES_PER_GROUP + + member_in_group * sizeof(TransactionId); +} + +#endif /* MULTIXACT_INTERNAL_H */ diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 8d57753ed01b..8576649b15e3 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -23,21 +23,6 @@ */ #define 
SLRU_MAX_ALLOWED_BUFFERS ((1024 * 1024 * 1024) / BLCKSZ) -/* - * Define SLRU segment size. A page is the same BLCKSZ as is used everywhere - * else in Postgres. The segment size can be chosen somewhat arbitrarily; - * we make it 32 pages by default, or 256Kb, i.e. 1M transactions for CLOG - * or 64K transactions for SUBTRANS. - * - * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, - * page numbering also wraps around at 0xFFFFFFFF/xxxx_XACTS_PER_PAGE (where - * xxxx is CLOG or SUBTRANS, respectively), and segment numbering at - * 0xFFFFFFFF/xxxx_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in slru.c, except when comparing - * segment and page numbers in SimpleLruTruncate (see PagePrecedes()). - */ -#define SLRU_PAGES_PER_SEGMENT 32 - /* * Page status codes. Note that these do not include the "dirty" bit. * page_dirty can be true only in the VALID or WRITE_IN_PROGRESS states; diff --git a/src/include/c.h b/src/include/c.h index 757dfff47825..bc92a6f4565c 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -670,7 +670,7 @@ typedef uint32 SubTransactionId; /* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */ typedef TransactionId MultiXactId; -typedef uint32 MultiXactOffset; +typedef uint64 MultiXactOffset; typedef uint32 CommandId; diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 7e1aa4223326..8556ce40cbfc 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -356,3 +356,18 @@ * Enable tracing of syncscan operations (see also the trace_syncscan GUC var). */ /* #define TRACE_SYNCSCAN */ + +/* + * Define SLRU segment size. A page is the same BLCKSZ as is used everywhere + * else in Postgres. The segment size can be chosen somewhat arbitrarily; + * we make it 32 pages by default, or 256Kb, i.e. 1M transactions for CLOG + * or 64K transactions for SUBTRANS. 
+ * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * page numbering also wraps around at 0xFFFFFFFF/xxxx_XACTS_PER_PAGE (where + * xxxx is CLOG or SUBTRANS, respectively), and segment numbering at + * 0xFFFFFFFF/xxxx_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need + * take no explicit notice of that fact in slru.c, except when comparing + * segment and page numbers in SimpleLruTruncate (see PagePrecedes()). + */ +#define SLRU_PAGES_PER_SEGMENT 32 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 432509277c98..9392bb729b9e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1725,6 +1725,7 @@ MultiXactMember MultiXactOffset MultiXactStateData MultiXactStatus +MultiXactWriter MultirangeIOData MultirangeParseState MultirangeType @@ -1808,6 +1809,7 @@ OffsetVarNodes_context Oid OidOptions OkeysState +OldMultiXactReader OldToNewMapping OldToNewMappingData OnCommitAction @@ -2804,6 +2806,7 @@ SlruCtlData SlruErrorCause SlruPageStatus SlruScanCallback +SlruSegState SlruShared SlruSharedData SlruWriteAll