From eff770bbc9a99fe489bd25e30a5b4ae775314c49 Mon Sep 17 00:00:00 2001
From: Andrew Dunstan
Date: Sat, 13 Oct 2012 22:37:58 -0400
Subject: [PATCH 1/4] Latest patch applied, in part by hand. Builds and runs.

---
 src/bin/pg_dump/Makefile              |   3 +-
 src/bin/pg_dump/compress_io.c         |  10 +
 src/bin/pg_dump/dumputils.c           |  87 +++-
 src/bin/pg_dump/dumputils.h           |   4 +
 src/bin/pg_dump/pg_backup.h           |   8 +-
 src/bin/pg_dump/pg_backup_archiver.c  | 715 +++++++++-----------------
 src/bin/pg_dump/pg_backup_archiver.h  |  46 +-
 src/bin/pg_dump/pg_backup_custom.c    |  89 +++-
 src/bin/pg_dump/pg_backup_db.c        |  20 +-
 src/bin/pg_dump/pg_backup_directory.c | 255 ++++++++-
 src/bin/pg_dump/pg_backup_tar.c       |   8 +-
 src/bin/pg_dump/pg_dump.c             | 249 ++++++---
 src/bin/pg_dump/pg_dump.h             |   3 +
 src/bin/pg_dump/pg_dump_sort.c        |  87 ++++
 src/bin/pg_dump/pg_restore.c          |  24 +-
 15 files changed, 1026 insertions(+), 582 deletions(-)

diff --git a/src/bin/pg_dump/Makefile b/src/bin/pg_dump/Makefile
index e9be18bd0b..f3d554f286 100644
--- a/src/bin/pg_dump/Makefile
+++ b/src/bin/pg_dump/Makefile
@@ -20,7 +20,8 @@ override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
 
 OBJS= pg_backup_archiver.o pg_backup_db.o pg_backup_custom.o \
 	pg_backup_null.o pg_backup_tar.o \
-	pg_backup_directory.o dumpmem.o dumputils.o compress_io.o $(WIN32RES)
+	pg_backup_directory.o dumpmem.o dumputils.o compress_io.o \
+	parallel.o $(WIN32RES)
 
 KEYWRDOBJS = keywords.o kwlookup.o
 
diff --git a/src/bin/pg_dump/compress_io.c b/src/bin/pg_dump/compress_io.c
index 024722fe2d..67d0ca9b7f 100644
--- a/src/bin/pg_dump/compress_io.c
+++ b/src/bin/pg_dump/compress_io.c
@@ -55,6 +55,7 @@
 #include "compress_io.h"
 #include "dumpmem.h"
 #include "dumputils.h"
+#include "parallel.h"
 
 /*----------------------
  * Compressor API
  */
@@ -183,6 +184,9 @@ size_t
 WriteDataToArchive(ArchiveHandle *AH, CompressorState *cs,
 				   const void *data, size_t dLen)
 {
+	/* Are we aborting? */
+	checkAborting(AH);
+
 	switch (cs->comprAlg)
 	{
 		case COMPR_ALG_LIBZ:
@@ -352,6 +356,9 @@ ReadDataFromArchiveZlib(ArchiveHandle *AH, ReadFunc readF)
 	/* no minimal chunk size for zlib */
 	while ((cnt = readF(AH, &buf, &buflen)))
 	{
+		/* Are we aborting? */
+		checkAborting(AH);
+
 		zp->next_in = (void *) buf;
 		zp->avail_in = cnt;
@@ -412,6 +419,9 @@ ReadDataFromArchiveNone(ArchiveHandle *AH, ReadFunc readF)
 	while ((cnt = readF(AH, &buf, &buflen)))
 	{
+		/* Are we aborting?
*/ + checkAborting(AH); + ahwrite(buf, 1, cnt, AH); } diff --git a/src/bin/pg_dump/dumputils.c b/src/bin/pg_dump/dumputils.c index 91f2774955..c55eaa173e 100644 --- a/src/bin/pg_dump/dumputils.c +++ b/src/bin/pg_dump/dumputils.c @@ -16,6 +16,7 @@ #include +#include "dumpmem.h" #include "dumputils.h" #include "parser/keywords.h" @@ -38,6 +39,7 @@ static struct } on_exit_nicely_list[MAX_ON_EXIT_NICELY]; static int on_exit_nicely_index; +void (*on_exit_msg_func)(const char *modulename, const char *fmt, va_list ap) = vwrite_msg; #define supports_grant_options(version) ((version) >= 70400) @@ -48,11 +50,21 @@ static bool parseAclItem(const char *item, const char *type, static char *copyAclUserName(PQExpBuffer output, char *input); static void AddAcl(PQExpBuffer aclbuf, const char *keyword, const char *subname); +static PQExpBuffer getThreadLocalPQExpBuffer(void); #ifdef WIN32 +static void shutdown_parallel_dump_utils(int code, void* unused); static bool parallel_init_done = false; static DWORD tls_index; static DWORD mainThreadId; + +static void +shutdown_parallel_dump_utils(int code, void* unused) +{ + /* Call the cleanup function only from the main thread */ + if (mainThreadId == GetCurrentThreadId()) + WSACleanup(); +} #endif void @@ -61,23 +73,29 @@ init_parallel_dump_utils(void) #ifdef WIN32 if (!parallel_init_done) { + WSADATA wsaData; + int err; + tls_index = TlsAlloc(); - parallel_init_done = true; mainThreadId = GetCurrentThreadId(); + err = WSAStartup(MAKEWORD(2, 2), &wsaData); + if (err != 0) + { + fprintf(stderr, _("WSAStartup failed: %d\n"), err); + exit_nicely(1); + } + on_exit_nicely(shutdown_parallel_dump_utils, NULL); + parallel_init_done = true; } #endif } /* - * Quotes input string if it's not a legitimate SQL identifier as-is. - * - * Note that the returned string must be used before calling fmtId again, - * since we re-use the same return buffer each time. Non-reentrant but - * reduces memory leakage. (On Windows the memory leakage will be one buffer - * per thread, which is at least better than one per call). + * Non-reentrant but reduces memory leakage. (On Windows the memory leakage + * will be one buffer per thread, which is at least better than one per call). */ -const char * -fmtId(const char *rawid) +static PQExpBuffer +getThreadLocalPQExpBuffer(void) { /* * The Tls code goes awry if we use a static var, so we provide for both @@ -86,9 +104,6 @@ fmtId(const char *rawid) static PQExpBuffer s_id_return = NULL; PQExpBuffer id_return; - const char *cp; - bool need_quotes = false; - #ifdef WIN32 if (parallel_init_done) id_return = (PQExpBuffer) TlsGetValue(tls_index); /* 0 when not set */ @@ -118,6 +133,23 @@ fmtId(const char *rawid) } + return id_return; +} + +/* + * Quotes input string if it's not a legitimate SQL identifier as-is. + * + * Note that the returned string must be used before calling fmtId again, + * since we re-use the same return buffer each time. + */ +const char * +fmtId(const char *rawid) +{ + PQExpBuffer id_return = getThreadLocalPQExpBuffer(); + + const char *cp; + bool need_quotes = false; + /* * These checks need to match the identifier production in scan.l. Don't * use islower() etc. @@ -185,6 +217,35 @@ fmtId(const char *rawid) return id_return->data; } +/* + * fmtQualifiedId - convert a qualified name to the proper format for + * the source database. + * + * Like fmtId, use the result before calling again. + * + * Since we call fmtId and it also uses getThreadLocalPQExpBuffer() we cannot + * use it until we're finished with calling fmtId(). 
+ */ +const char * +fmtQualifiedId(int remoteVersion, const char *schema, const char *id) +{ + PQExpBuffer id_return; + PQExpBuffer lcl_pqexp = createPQExpBuffer(); + + /* Suppress schema name if fetching from pre-7.3 DB */ + if (remoteVersion >= 70300 && schema && *schema) + { + appendPQExpBuffer(lcl_pqexp, "%s.", fmtId(schema)); + } + appendPQExpBuffer(lcl_pqexp, "%s", fmtId(id)); + + id_return = getThreadLocalPQExpBuffer(); + + appendPQExpBuffer(id_return, "%s", lcl_pqexp->data); + destroyPQExpBuffer(lcl_pqexp); + + return id_return->data; +} /* * Convert a string value to an SQL string literal and append it to @@ -1312,7 +1373,7 @@ exit_horribly(const char *modulename, const char *fmt,...) va_list ap; va_start(ap, fmt); - vwrite_msg(modulename, fmt, ap); + on_exit_msg_func(modulename, fmt, ap); va_end(ap); exit_nicely(1); diff --git a/src/bin/pg_dump/dumputils.h b/src/bin/pg_dump/dumputils.h index 4ef8cb3a49..fd92970028 100644 --- a/src/bin/pg_dump/dumputils.h +++ b/src/bin/pg_dump/dumputils.h @@ -34,6 +34,8 @@ extern const char *progname; extern void init_parallel_dump_utils(void); extern const char *fmtId(const char *identifier); +extern const char *fmtQualifiedId(int remoteVersion, + const char *schema, const char *id); extern void appendStringLiteral(PQExpBuffer buf, const char *str, int encoding, bool std_strings); extern void appendStringLiteralConn(PQExpBuffer buf, const char *str, @@ -72,6 +74,8 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0))); extern void exit_horribly(const char *modulename, const char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3), noreturn)); +extern void (*on_exit_msg_func)(const char *modulename, const char *fmt, va_list ap) + __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0))); extern void on_exit_nicely(on_exit_nicely_callback function, void *arg); extern void exit_nicely(int code) __attribute__((noreturn)); diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index 3b49395ecb..623654435b 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -81,9 +81,13 @@ struct Archive int minRemoteVersion; /* allowable range */ int maxRemoteVersion; + int numWorkers; /* number of parallel processes */ + char *sync_snapshot_id; /* sync snapshot id for parallel operation */ + /* info needed for string escaping */ int encoding; /* libpq code for client_encoding */ bool std_strings; /* standard_conforming_strings */ + char *use_role; /* Issue SET ROLE to this */ /* error handling */ bool exit_on_error; /* whether to exit on SQL errors... 
*/ @@ -141,7 +145,6 @@ typedef struct _restoreOptions int suppressDumpWarnings; /* Suppress output of WARNING entries * to stderr */ bool single_txn; - int number_of_jobs; bool *idWanted; /* array showing which dump IDs to emit */ } RestoreOptions; @@ -195,6 +198,9 @@ extern void PrintTOCSummary(Archive *AH, RestoreOptions *ropt); extern RestoreOptions *NewRestoreOptions(void); +/* We have one in pg_dump.c and another one in pg_restore.c */ +extern void _SetupWorker(Archive *AHX, RestoreOptions *ropt); + /* Rearrange and filter TOC entries */ extern void SortTocFromFile(Archive *AHX, RestoreOptions *ropt); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 7f47a7c51a..cd7fed9a99 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -23,8 +23,10 @@ #include "pg_backup_db.h" #include "dumpmem.h" #include "dumputils.h" +#include "parallel.h" #include +#include #include #include #include @@ -36,72 +38,6 @@ #include "libpq/libpq-fs.h" -/* - * Special exit values from worker children. We reserve 0 for normal - * success; 1 and other small values should be interpreted as crashes. - */ -#define WORKER_CREATE_DONE 10 -#define WORKER_INHIBIT_DATA 11 -#define WORKER_IGNORED_ERRORS 12 - -/* - * Unix uses exit to return result from worker child, so function is void. - * Windows thread result comes via function return. - */ -#ifndef WIN32 -#define parallel_restore_result void -#else -#define parallel_restore_result DWORD -#endif - -/* IDs for worker children are either PIDs or thread handles */ -#ifndef WIN32 -#define thandle pid_t -#else -#define thandle HANDLE -#endif - -typedef struct ParallelStateEntry -{ -#ifdef WIN32 - unsigned int threadId; -#else - pid_t pid; -#endif - ArchiveHandle *AH; -} ParallelStateEntry; - -typedef struct ParallelState -{ - int numWorkers; - ParallelStateEntry *pse; -} ParallelState; - -/* Arguments needed for a worker child */ -typedef struct _restore_args -{ - ArchiveHandle *AH; - TocEntry *te; - ParallelStateEntry *pse; -} RestoreArgs; - -/* State for each parallel activity slot */ -typedef struct _parallel_slot -{ - thandle child_id; - RestoreArgs *args; -} ParallelSlot; - -typedef struct ShutdownInformation -{ - ParallelState *pstate; - Archive *AHX; -} ShutdownInformation; - -static ShutdownInformation shutdown_info; - -#define NO_SLOT (-1) - #define TEXT_DUMP_HEADER "--\n-- PostgreSQL database dump\n--\n\n" #define TEXT_DUMPALL_HEADER "--\n-- PostgreSQL database cluster dump\n--\n\n" @@ -137,7 +73,6 @@ static bool _tocEntryIsACL(TocEntry *te); static void _disableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt); static void _enableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt); static void buildTocEntryArrays(ArchiveHandle *AH); -static TocEntry *getTocEntryByDumpId(ArchiveHandle *AH, DumpId id); static void _moveBefore(ArchiveHandle *AH, TocEntry *pos, TocEntry *te); static int _discoverArchiveFormat(ArchiveHandle *AH); @@ -150,21 +85,19 @@ static void RestoreOutput(ArchiveHandle *AH, OutputContext savedContext); static int restore_toc_entry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool is_parallel); -static void restore_toc_entries_parallel(ArchiveHandle *AH); -static thandle spawn_restore(RestoreArgs *args); -static thandle reap_child(ParallelSlot *slots, int n_slots, int *work_status); -static bool work_in_progress(ParallelSlot *slots, int n_slots); -static int get_next_slot(ParallelSlot *slots, int n_slots); 
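The slot bookkeeping removed above is taken over by the ParallelState machinery in the new parallel.h/parallel.c, which this patch references (see the Makefile change and the new #include "parallel.h" lines) but does not contain. As a rough orientation for review, the call sites later in this patch (pstate->numWorkers, pstate->parallelSlot[i].workerStatus, pstate->parallelSlot[i].args->te, WRKR_WORKING, GetIdleWorker(), DispatchJobForTocEntry()) suggest structures along the following lines; any name not visible in the diff is a guess, not the real definition:

/* Sketch only: inferred from call sites in this patch, not the actual parallel.h */
typedef enum
{
	WRKR_IDLE,					/* guessed name: slot free for a new job */
	WRKR_WORKING,				/* referenced below in get_next_work_item() */
	WRKR_FINISHED				/* guessed name: result waiting for ReapWorkerStatus() */
} T_WorkerStatus;

typedef struct ParallelArgs
{
	ArchiveHandle *AH;			/* the worker's own (cloned) archive handle */
	TocEntry   *te;				/* the TOC entry being dumped or restored */
} ParallelArgs;

typedef struct ParallelSlot
{
	T_WorkerStatus workerStatus;
	ParallelArgs *args;			/* current work item; unused while idle */
	/* plus whatever pipe/thread handles the master/worker protocol needs */
} ParallelSlot;

typedef struct ParallelState
{
	int			numWorkers;
	ParallelSlot *parallelSlot; /* array of numWorkers slots */
} ParallelState;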
+static void restore_toc_entries_prefork(ArchiveHandle *AH); +static void restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate, + TocEntry *pending_list); +static void restore_toc_entries_postfork(ArchiveHandle *AH, TocEntry *pending_list); static void par_list_header_init(TocEntry *l); static void par_list_append(TocEntry *l, TocEntry *te); static void par_list_remove(TocEntry *te); static TocEntry *get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, - ParallelSlot *slots, int n_slots); -static parallel_restore_result parallel_restore(RestoreArgs *args); + ParallelState *pstate); static void mark_work_done(ArchiveHandle *AH, TocEntry *ready_list, - thandle worker, int status, - ParallelSlot *slots, int n_slots); + int worker, int status, + ParallelState *pstate); static void fix_dependencies(ArchiveHandle *AH); static bool has_lock_conflicts(TocEntry *te1, TocEntry *te2); static void repoint_table_dependencies(ArchiveHandle *AH); @@ -173,14 +106,6 @@ static void reduce_dependencies(ArchiveHandle *AH, TocEntry *te, TocEntry *ready_list); static void mark_create_done(ArchiveHandle *AH, TocEntry *te); static void inhibit_data_for_failed_table(ArchiveHandle *AH, TocEntry *te); -static ArchiveHandle *CloneArchive(ArchiveHandle *AH); -static void DeCloneArchive(ArchiveHandle *AH); - -static void setProcessIdentifier(ParallelStateEntry *pse, ArchiveHandle *AH); -static void unsetProcessIdentifier(ParallelStateEntry *pse); -static ParallelStateEntry *GetMyPSEntry(ParallelState *pstate); -static void archive_close_connection(int code, void *arg); - /* * Wrapper functions. @@ -321,7 +246,7 @@ RestoreArchive(Archive *AHX) /* * If we're going to do parallel restore, there are some restrictions. */ - parallel_mode = (ropt->number_of_jobs > 1 && ropt->useDB); + parallel_mode = (AH->public.numWorkers > 1 && ropt->useDB); if (parallel_mode) { /* We haven't got round to making this work for all archive formats */ @@ -491,7 +416,25 @@ RestoreArchive(Archive *AHX) * In parallel mode, turn control over to the parallel-restore logic. */ if (parallel_mode) - restore_toc_entries_parallel(AH); + { + ParallelState *pstate; + TocEntry pending_list; + + par_list_header_init(&pending_list); + + /* This runs PRE_DATA items and then disconnects from the database */ + restore_toc_entries_prefork(AH); + Assert(AH->connection == NULL); + + /* ParallelBackupStart() will actually fork the processes */ + pstate = ParallelBackupStart(AH, ropt); + restore_toc_entries_parallel(AH, pstate, &pending_list); + ParallelBackupEnd(AH, pstate); + + /* reconnect the master and see if we missed something */ + restore_toc_entries_postfork(AH, &pending_list); + Assert(AH->connection != NULL); + } else { for (te = AH->toc->next; te != AH->toc; te = te->next) @@ -550,7 +493,7 @@ static int restore_toc_entry(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt, bool is_parallel) { - int retval = 0; + int status = WORKER_OK; teReqs reqs; bool defnDumped; @@ -603,7 +546,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, if (ropt->noDataForFailedTables) { if (is_parallel) - retval = WORKER_INHIBIT_DATA; + status = WORKER_INHIBIT_DATA; else inhibit_data_for_failed_table(AH, te); } @@ -618,7 +561,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, * just set the return value. 
*/ if (is_parallel) - retval = WORKER_CREATE_DONE; + status = WORKER_CREATE_DONE; else mark_create_done(AH, te); } @@ -736,7 +679,10 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, } } - return retval; + if (AH->public.n_errors > 0 && status == WORKER_OK) + status = WORKER_IGNORED_ERRORS; + + return status; } /* @@ -1629,7 +1575,7 @@ buildTocEntryArrays(ArchiveHandle *AH) } } -static TocEntry * +TocEntry * getTocEntryByDumpId(ArchiveHandle *AH, DumpId id) { /* build index arrays if we didn't already */ @@ -2127,50 +2073,67 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt, return AH; } - void -WriteDataChunks(ArchiveHandle *AH) +WriteDataChunks(ArchiveHandle *AH, ParallelState *pstate) { TocEntry *te; - StartDataPtr startPtr; - EndDataPtr endPtr; for (te = AH->toc->next; te != AH->toc; te = te->next) { - if (te->dataDumper != NULL && (te->reqs & REQ_DATA) != 0) - { - AH->currToc = te; - /* printf("Writing data for %d (%x)\n", te->id, te); */ - - if (strcmp(te->desc, "BLOBS") == 0) - { - startPtr = AH->StartBlobsPtr; - endPtr = AH->EndBlobsPtr; - } - else - { - startPtr = AH->StartDataPtr; - endPtr = AH->EndDataPtr; - } + if (!te->dataDumper) + continue; - if (startPtr != NULL) - (*startPtr) (AH, te); + if ((te->reqs & REQ_DATA) == 0) + continue; + if (pstate && pstate->numWorkers > 1) + { /* - * printf("Dumper arg for %d is %x\n", te->id, te->dataDumperArg); + * If we are in a parallel backup, then we are always the master + * process. */ + EnsureIdleWorker(AH, pstate); + Assert(GetIdleWorker(pstate) != NO_SLOT); + DispatchJobForTocEntry(AH, pstate, te, ACT_DUMP); + } + else + WriteDataChunksForTocEntry(AH, te); + } + EnsureWorkersFinished(AH, pstate); +} - /* - * The user-provided DataDumper routine needs to call - * AH->WriteData - */ - (*te->dataDumper) ((Archive *) AH, te->dataDumperArg); +void +WriteDataChunksForTocEntry(ArchiveHandle *AH, TocEntry *te) +{ + StartDataPtr startPtr; + EndDataPtr endPtr; - if (endPtr != NULL) - (*endPtr) (AH, te); - AH->currToc = NULL; - } + AH->currToc = te; + + if (strcmp(te->desc, "BLOBS") == 0) + { + startPtr = AH->StartBlobsPtr; + endPtr = AH->EndBlobsPtr; } + else + { + startPtr = AH->StartDataPtr; + endPtr = AH->EndDataPtr; + } + + if (startPtr != NULL) + (*startPtr) (AH, te); + + /* + * The user-provided DataDumper routine needs to call + * AH->WriteData + */ + (*te->dataDumper) ((Archive *) AH, te->dataDumperArg); + + if (endPtr != NULL) + (*endPtr) (AH, te); + + AH->currToc = NULL; } void @@ -3394,67 +3357,6 @@ dumpTimestamp(ArchiveHandle *AH, const char *msg, time_t tim) ahprintf(AH, "-- %s %s\n\n", msg, buf); } -static void -setProcessIdentifier(ParallelStateEntry *pse, ArchiveHandle *AH) -{ -#ifdef WIN32 - pse->threadId = GetCurrentThreadId(); -#else - pse->pid = getpid(); -#endif - pse->AH = AH; -} - -static void -unsetProcessIdentifier(ParallelStateEntry *pse) -{ -#ifdef WIN32 - pse->threadId = 0; -#else - pse->pid = 0; -#endif - pse->AH = NULL; -} - -static ParallelStateEntry * -GetMyPSEntry(ParallelState *pstate) -{ - int i; - - for (i = 0; i < pstate->numWorkers; i++) -#ifdef WIN32 - if (pstate->pse[i].threadId == GetCurrentThreadId()) -#else - if (pstate->pse[i].pid == getpid()) -#endif - return &(pstate->pse[i]); - - return NULL; -} - -static void -archive_close_connection(int code, void *arg) -{ - ShutdownInformation *si = (ShutdownInformation *) arg; - - if (si->pstate) - { - ParallelStateEntry *entry = GetMyPSEntry(si->pstate); - - if (entry != NULL && entry->AH) - DisconnectDatabase(&(entry->AH->public)); - } - else 
if (si->AHX) - DisconnectDatabase(si->AHX); -} - -void -on_exit_close_archive(Archive *AHX) -{ - shutdown_info.AHX = AHX; - on_exit_nicely(archive_close_connection, &shutdown_info); -} - /* * Main engine for parallel restore. * @@ -3467,30 +3369,13 @@ on_exit_close_archive(Archive *AHX) * RestoreArchive). */ static void -restore_toc_entries_parallel(ArchiveHandle *AH) -{ - RestoreOptions *ropt = AH->ropt; - int n_slots = ropt->number_of_jobs; - ParallelSlot *slots; - int work_status; - int next_slot; - bool skipped_some; - TocEntry pending_list; - TocEntry ready_list; - TocEntry *next_work_item; - thandle ret_child; - TocEntry *te; - ParallelState *pstate; - int i; - - ahlog(AH, 2, "entering restore_toc_entries_parallel\n"); - - slots = (ParallelSlot *) pg_malloc0(n_slots * sizeof(ParallelSlot)); - pstate = (ParallelState *) pg_malloc(sizeof(ParallelState)); - pstate->pse = (ParallelStateEntry *) pg_malloc0(n_slots * sizeof(ParallelStateEntry)); - pstate->numWorkers = ropt->number_of_jobs; - for (i = 0; i < pstate->numWorkers; i++) - unsetProcessIdentifier(&(pstate->pse[i])); +restore_toc_entries_prefork(ArchiveHandle *AH) + { + RestoreOptions *ropt = AH->ropt; + bool skipped_some; + TocEntry *next_work_item; + + ahlog(AH, 2, "entering restore_toc_entries_prefork\n"); /* Adjust dependency information */ fix_dependencies(AH); @@ -3551,12 +3436,6 @@ restore_toc_entries_parallel(ArchiveHandle *AH) */ DisconnectDatabase(&AH->public); - /* - * Set the pstate in the shutdown_info. The exit handler uses pstate if - * set and falls back to AHX otherwise. - */ - shutdown_info.pstate = pstate; - /* blow away any transient state from the old connection */ if (AH->currUser) free(AH->currUser); @@ -3568,17 +3447,42 @@ restore_toc_entries_parallel(ArchiveHandle *AH) free(AH->currTablespace); AH->currTablespace = NULL; AH->currWithOids = -1; +} + +/* + * Main engine for parallel restore. + * + * Work is done in three phases. + * First we process all SECTION_PRE_DATA tocEntries, in a single connection, + * just as for a standard restore. This is done in restore_toc_entries_prefork(). + * Second we process the remaining non-ACL steps in parallel worker children + * (threads on Windows, processes on Unix), these fork off and set up their + * connections before we call restore_toc_entries_parallel_forked. + * Finally we process all the ACL entries in a single connection (that happens + * back in RestoreArchive). + */ +static void +restore_toc_entries_parallel(ArchiveHandle *AH, ParallelState *pstate, + TocEntry *pending_list) +{ + int work_status; + bool skipped_some; + TocEntry ready_list; + TocEntry *next_work_item; + int ret_child; + + ahlog(AH, 2, "entering restore_toc_entries_parallel\n"); /* - * Initialize the lists of pending and ready items. After this setup, the - * pending list is everything that needs to be done but is blocked by one - * or more dependencies, while the ready list contains items that have no - * remaining dependencies. Note: we don't yet filter out entries that - * aren't going to be restored. They might participate in dependency - * chains connecting entries that should be restored, so we treat them as - * live until we actually process them. + * Initialize the lists of ready items, the list for pending items has + * already been initialized in the caller. After this setup, the pending + * list is everything that needs to be done but is blocked by one or more + * dependencies, while the ready list contains items that have no remaining + * dependencies. 
Note: we don't yet filter out entries that aren't going + * to be restored. They might participate in dependency chains connecting + * entries that should be restored, so we treat them as live until we + * actually process them. */ - par_list_header_init(&pending_list); par_list_header_init(&ready_list); skipped_some = false; for (next_work_item = AH->toc->next; next_work_item != AH->toc; next_work_item = next_work_item->next) @@ -3603,7 +3507,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH) } if (next_work_item->depCount > 0) - par_list_append(&pending_list, next_work_item); + par_list_append(pending_list, next_work_item); else par_list_append(&ready_list, next_work_item); } @@ -3617,9 +3521,8 @@ restore_toc_entries_parallel(ArchiveHandle *AH) ahlog(AH, 1, "entering main parallel loop\n"); - while ((next_work_item = get_next_work_item(AH, &ready_list, - slots, n_slots)) != NULL || - work_in_progress(slots, n_slots)) + while ((next_work_item = get_next_work_item(AH, &ready_list, pstate)) != NULL || + !IsEveryWorkerIdle(pstate)) { if (next_work_item != NULL) { @@ -3637,62 +3540,71 @@ restore_toc_entries_parallel(ArchiveHandle *AH) continue; } - if ((next_slot = get_next_slot(slots, n_slots)) != NO_SLOT) - { - /* There is work still to do and a worker slot available */ - thandle child; - RestoreArgs *args; + ahlog(AH, 1, "launching item %d %s %s\n", + next_work_item->dumpId, + next_work_item->desc, next_work_item->tag); - ahlog(AH, 1, "launching item %d %s %s\n", - next_work_item->dumpId, - next_work_item->desc, next_work_item->tag); - - par_list_remove(next_work_item); + par_list_remove(next_work_item); - /* this memory is dealloced in mark_work_done() */ - args = pg_malloc(sizeof(RestoreArgs)); - args->AH = CloneArchive(AH); - args->te = next_work_item; - args->pse = &pstate->pse[next_slot]; + Assert(GetIdleWorker(pstate) != NO_SLOT); + DispatchJobForTocEntry(AH, pstate, next_work_item, ACT_RESTORE); + } + else + /* at least one child is working and we have nothing ready. */ + Assert(!IsEveryWorkerIdle(pstate)); - /* run the step in a worker child */ - child = spawn_restore(args); + for (;;) + { + int nTerm = 0; - slots[next_slot].child_id = child; - slots[next_slot].args = args; + /* + * In order to reduce dependencies as soon as possible and + * especially to reap the status of workers who are working on + * items that pending items depend on, we do a non-blocking check + * for ended workers first. + * + * However, if we do not have any other work items currently that + * workers can work on, we do not busy-loop here but instead + * really wait for at least one worker to terminate. Hence we call + * ListenToWorkers(..., ..., do_wait = true) in this case. + */ + ListenToWorkers(AH, pstate, !next_work_item); - continue; + while ((ret_child = ReapWorkerStatus(pstate, &work_status)) != NO_SLOT) + { + nTerm++; + mark_work_done(AH, &ready_list, ret_child, work_status, pstate); } - } - /* - * If we get here there must be work being done. Either there is no - * work available to schedule (and work_in_progress returned true) or - * there are no slots available. So we wait for a worker to finish, - * and process the result. - */ - ret_child = reap_child(slots, n_slots, &work_status); + /* + * We need to make sure that we have an idle worker before re-running the + * loop. If nTerm > 0 we already have that (quick check). 
+ */ + if (nTerm > 0) + break; - if (WIFEXITED(work_status)) - { - mark_work_done(AH, &ready_list, - ret_child, WEXITSTATUS(work_status), - slots, n_slots); - } - else - { - exit_horribly(modulename, "worker process crashed: status %d\n", - work_status); + /* if nobody terminated, explicitly check for an idle worker */ + if (GetIdleWorker(pstate) != NO_SLOT) + break; + + /* + * If we have no idle worker, read the result of one or more + * workers and loop the loop to call ReapWorkerStatus() on them. + */ + ListenToWorkers(AH, pstate, true); } } ahlog(AH, 1, "finished main parallel loop\n"); +} - /* - * Remove the pstate again, so the exit handler will now fall back to - * closing AH->connection again. - */ - shutdown_info.pstate = NULL; +static void +restore_toc_entries_postfork(ArchiveHandle *AH, TocEntry *pending_list) +{ + RestoreOptions *ropt = AH->ropt; + TocEntry *te; + + ahlog(AH, 2, "entering restore_toc_entries_postfork\n"); /* * Now reconnect the single parent connection. @@ -3708,7 +3620,7 @@ restore_toc_entries_parallel(ArchiveHandle *AH) * dependencies, or some other pathological condition. If so, do it in the * single parent connection. */ - for (te = pending_list.par_next; te != &pending_list; te = te->par_next) + for (te = pending_list->par_next; te != pending_list; te = te->par_next) { ahlog(AH, 1, "processing missed item %d %s %s\n", te->dumpId, te->desc, te->tag); @@ -3718,121 +3630,6 @@ restore_toc_entries_parallel(ArchiveHandle *AH) /* The ACLs will be handled back in RestoreArchive. */ } -/* - * create a worker child to perform a restore step in parallel - */ -static thandle -spawn_restore(RestoreArgs *args) -{ - thandle child; - - /* Ensure stdio state is quiesced before forking */ - fflush(NULL); - -#ifndef WIN32 - child = fork(); - if (child == 0) - { - /* in child process */ - parallel_restore(args); - exit_horribly(modulename, - "parallel_restore should not return\n"); - } - else if (child < 0) - { - /* fork failed */ - exit_horribly(modulename, - "could not create worker process: %s\n", - strerror(errno)); - } -#else - child = (HANDLE) _beginthreadex(NULL, 0, (void *) parallel_restore, - args, 0, NULL); - if (child == 0) - exit_horribly(modulename, - "could not create worker thread: %s\n", - strerror(errno)); -#endif - - return child; -} - -/* - * collect status from a completed worker child - */ -static thandle -reap_child(ParallelSlot *slots, int n_slots, int *work_status) -{ -#ifndef WIN32 - /* Unix is so much easier ... */ - return wait(work_status); -#else - static HANDLE *handles = NULL; - int hindex, - snum, - tnum; - thandle ret_child; - DWORD res; - - /* first time around only, make space for handles to listen on */ - if (handles == NULL) - handles = (HANDLE *) pg_malloc0(n_slots * sizeof(HANDLE)); - - /* set up list of handles to listen to */ - for (snum = 0, tnum = 0; snum < n_slots; snum++) - if (slots[snum].child_id != 0) - handles[tnum++] = slots[snum].child_id; - - /* wait for one to finish */ - hindex = WaitForMultipleObjects(tnum, handles, false, INFINITE); - - /* get handle of finished thread */ - ret_child = handles[hindex - WAIT_OBJECT_0]; - - /* get the result */ - GetExitCodeThread(ret_child, &res); - *work_status = res; - - /* dispose of handle to stop leaks */ - CloseHandle(ret_child); - - return ret_child; -#endif -} - -/* - * are we doing anything now? 
- */ -static bool -work_in_progress(ParallelSlot *slots, int n_slots) -{ - int i; - - for (i = 0; i < n_slots; i++) - { - if (slots[i].child_id != 0) - return true; - } - return false; -} - -/* - * find the first free parallel slot (if any). - */ -static int -get_next_slot(ParallelSlot *slots, int n_slots) -{ - int i; - - for (i = 0; i < n_slots; i++) - { - if (slots[i].child_id == 0) - return i; - } - return NO_SLOT; -} - - /* * Check if te1 has an exclusive lock requirement for an item that te2 also * requires, whether or not te2's requirement is for an exclusive lock. @@ -3906,7 +3703,7 @@ par_list_remove(TocEntry *te) */ static TocEntry * get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, - ParallelSlot *slots, int n_slots) + ParallelState *pstate) { bool pref_non_data = false; /* or get from AH->ropt */ TocEntry *data_te = NULL; @@ -3921,11 +3718,11 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, { int count = 0; - for (k = 0; k < n_slots; k++) - if (slots[k].args->te != NULL && - slots[k].args->te->section == SECTION_DATA) + for (k = 0; k < pstate->numWorkers; k++) + if (pstate->parallelSlot[k].args->te != NULL && + pstate->parallelSlot[k].args->te->section == SECTION_DATA) count++; - if (n_slots == 0 || count * 4 < n_slots) + if (pstate->numWorkers == 0 || count * 4 < pstate->numWorkers) pref_non_data = false; } @@ -3941,13 +3738,13 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, * that a currently running item also needs lock on, or vice versa. If * so, we don't want to schedule them together. */ - for (i = 0; i < n_slots && !conflicts; i++) + for (i = 0; i < pstate->numWorkers && !conflicts; i++) { TocEntry *running_te; - if (slots[i].args == NULL) + if (pstate->parallelSlot[i].workerStatus != WRKR_WORKING) continue; - running_te = slots[i].args->te; + running_te = pstate->parallelSlot[i].args->te; if (has_lock_conflicts(te, running_te) || has_lock_conflicts(running_te, te)) @@ -3982,63 +3779,29 @@ get_next_work_item(ArchiveHandle *AH, TocEntry *ready_list, /* * Restore a single TOC item in parallel with others * - * this is the procedure run as a thread (Windows) or a - * separate process (everything else). + * this is run in the worker, i.e. in a thread (Windows) or a separate process + * (everything else). A worker process executes several such work items during + * a parallel backup or restore. Once we terminate here and report back that + * our work is finished, the master process will assign us a new work item. */ -static parallel_restore_result -parallel_restore(RestoreArgs *args) +int +parallel_restore(ParallelArgs *args) { ArchiveHandle *AH = args->AH; TocEntry *te = args->te; RestoreOptions *ropt = AH->ropt; - int retval; - - setProcessIdentifier(args->pse, AH); - - /* - * Close and reopen the input file so we have a private file pointer that - * doesn't stomp on anyone else's file pointer, if we're actually going to - * need to read from the file. Otherwise, just close it except on Windows, - * where it will possibly be needed by other threads. - * - * Note: on Windows, since we are using threads not processes, the reopen - * call *doesn't* close the original file pointer but just open a new one. 
- */ - if (te->section == SECTION_DATA) - (AH->ReopenPtr) (AH); -#ifndef WIN32 - else - (AH->ClosePtr) (AH); -#endif - - /* - * We need our own database connection, too - */ - ConnectDatabase((Archive *) AH, ropt->dbname, - ropt->pghost, ropt->pgport, ropt->username, - ropt->promptPassword); + int status; _doSetFixedOutputState(AH); - /* Restore the TOC item */ - retval = restore_toc_entry(AH, te, ropt, true); - - /* And clean up */ - DisconnectDatabase((Archive *) AH); - unsetProcessIdentifier(args->pse); + Assert(AH->connection != NULL); - /* If we reopened the file, we are done with it, so close it now */ - if (te->section == SECTION_DATA) - (AH->ClosePtr) (AH); + AH->public.n_errors = 0; - if (retval == 0 && AH->public.n_errors) - retval = WORKER_IGNORED_ERRORS; + /* Restore the TOC item */ + status = restore_toc_entry(AH, te, ropt, true); -#ifndef WIN32 - exit(retval); -#else - return retval; -#endif + return status; } @@ -4050,25 +3813,12 @@ parallel_restore(RestoreArgs *args) */ static void mark_work_done(ArchiveHandle *AH, TocEntry *ready_list, - thandle worker, int status, - ParallelSlot *slots, int n_slots) + int worker, int status, + ParallelState *pstate) { TocEntry *te = NULL; - int i; - - for (i = 0; i < n_slots; i++) - { - if (slots[i].child_id == worker) - { - slots[i].child_id = 0; - te = slots[i].args->te; - DeCloneArchive(slots[i].args->AH); - free(slots[i].args); - slots[i].args = NULL; - break; - } - } + te = pstate->parallelSlot[worker].args->te; if (te == NULL) exit_horribly(modulename, "could not find slot of finished worker\n"); @@ -4367,16 +4117,13 @@ inhibit_data_for_failed_table(ArchiveHandle *AH, TocEntry *te) } } - /* * Clone and de-clone routines used in parallel restoration. * * Enough of the structure is cloned to ensure that there is no * conflict between different threads each with their own clone. - * - * These could be public, but no need at present. */ -static ArchiveHandle * +ArchiveHandle * CloneArchive(ArchiveHandle *AH) { ArchiveHandle *clone; @@ -4402,9 +4149,59 @@ CloneArchive(ArchiveHandle *AH) /* clone has its own error count, too */ clone->public.n_errors = 0; + /* + * Connect our new clone object to the database: + * In parallel restore the parent is already disconnected, because we can + * connect the worker processes independently to the database (no snapshot + * sync required). + * In parallel backup we clone the parent's existing connection. + */ + if (AH->mode == archModeRead) + { + RestoreOptions *ropt = AH->ropt; + Assert(AH->connection == NULL); + /* this also sets clone->connection */ + ConnectDatabase((Archive *) clone, ropt->dbname, + ropt->pghost, ropt->pgport, ropt->username, + ropt->promptPassword); + } + else + { + char *dbname; + char *pghost; + char *pgport; + char *username; + const char *encname; + + Assert(AH->connection != NULL); + + /* + * Even though we are technically accessing the parent's database object + * here, these functions are fine to be called like that because all just + * return a pointer and do not actually send/receive any data to/from the + * database. 
+ */ + dbname = PQdb(AH->connection); + pghost = PQhost(AH->connection); + pgport = PQport(AH->connection); + username = PQuser(AH->connection); + encname = pg_encoding_to_char(AH->public.encoding); + + /* this also sets clone->connection */ + ConnectDatabase((Archive *) clone, dbname, pghost, pgport, username, TRI_NO); + + /* + * Set the same encoding, whatever we set here is what we got from + * pg_encoding_to_char(), so we really shouldn't run into an error setting that + * very same value. Also see the comment in SetupConnection(). + */ + PQsetClientEncoding(clone->connection, encname); + } + /* Let the format-specific code have a chance too */ (clone->ClonePtr) (clone); + Assert(clone->connection != NULL); return clone; } @@ -4413,7 +4210,7 @@ CloneArchive(ArchiveHandle *AH) * * Note: we assume any clone-local connection was already closed. */ -static void +void DeCloneArchive(ArchiveHandle *AH) { /* Clear format-specific state */ diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index 8859bd9776..844066e430 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -100,8 +100,21 @@ typedef z_stream *z_streamp; #define K_OFFSET_POS_SET 2 #define K_OFFSET_NO_DATA 3 +/* + * Special exit values from worker children. We reserve 0 for normal + * success; 1 and other small values should be interpreted as crashes. + */ +#define WORKER_OK 0 +#define WORKER_CREATE_DONE 10 +#define WORKER_INHIBIT_DATA 11 +#define WORKER_IGNORED_ERRORS 12 + struct _archiveHandle; struct _tocEntry; +struct _restoreList; +struct ParallelArgs; +struct ParallelState; +enum T_Action; typedef void (*ClosePtr) (struct _archiveHandle * AH); typedef void (*ReopenPtr) (struct _archiveHandle * AH); @@ -129,6 +142,13 @@ typedef void (*PrintTocDataPtr) (struct _archiveHandle * AH, struct _tocEntry * typedef void (*ClonePtr) (struct _archiveHandle * AH); typedef void (*DeClonePtr) (struct _archiveHandle * AH); +typedef char *(*WorkerJobRestorePtr)(struct _archiveHandle * AH, struct _tocEntry * te); +typedef char *(*WorkerJobDumpPtr)(struct _archiveHandle * AH, struct _tocEntry * te); +typedef char *(*MasterStartParallelItemPtr)(struct _archiveHandle * AH, struct _tocEntry * te, + enum T_Action act); +typedef int (*MasterEndParallelItemPtr)(struct _archiveHandle * AH, struct _tocEntry * te, + const char *str, enum T_Action act); + typedef size_t (*CustomOutPtr) (struct _archiveHandle * AH, const void *buf, size_t len); typedef enum @@ -227,6 +247,12 @@ typedef struct _archiveHandle StartBlobPtr StartBlobPtr; EndBlobPtr EndBlobPtr; + MasterStartParallelItemPtr MasterStartParallelItemPtr; + MasterEndParallelItemPtr MasterEndParallelItemPtr; + + WorkerJobDumpPtr WorkerJobDumpPtr; + WorkerJobRestorePtr WorkerJobRestorePtr; + ClonePtr ClonePtr; /* Clone format-specific fields */ DeClonePtr DeClonePtr; /* Clean up cloned fields */ @@ -236,6 +262,7 @@ typedef struct _archiveHandle char *archdbname; /* DB name *read* from archive */ enum trivalue promptPassword; char *savedPassword; /* password for ropt->username, if known */ + char *use_role; PGconn *connection; int connectToDB; /* Flag to indicate if direct DB connection is * required */ @@ -327,6 +354,7 @@ typedef struct _tocEntry int nLockDeps; /* number of such dependencies */ } TocEntry; +extern int parallel_restore(struct ParallelArgs *args); extern void on_exit_close_archive(Archive *AHX); extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *modulename, const char *fmt,...) 
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4))); @@ -337,9 +365,13 @@ extern void WriteHead(ArchiveHandle *AH); extern void ReadHead(ArchiveHandle *AH); extern void WriteToc(ArchiveHandle *AH); extern void ReadToc(ArchiveHandle *AH); -extern void WriteDataChunks(ArchiveHandle *AH); +extern void WriteDataChunks(ArchiveHandle *AH, struct ParallelState *pstate); +extern void WriteDataChunksForTocEntry(ArchiveHandle *AH, TocEntry *te); +extern ArchiveHandle *CloneArchive(ArchiveHandle *AH); +extern void DeCloneArchive(ArchiveHandle *AH); extern teReqs TocIDRequired(ArchiveHandle *AH, DumpId id); +TocEntry *getTocEntryByDumpId(ArchiveHandle *AH, DumpId id); extern bool checkSeek(FILE *fp); #define appendStringLiteralAHX(buf,str,AH) \ @@ -380,4 +412,16 @@ int ahprintf(ArchiveHandle *AH, const char *fmt,...) __attribute__((format(PG_ void ahlog(ArchiveHandle *AH, int level, const char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4))); +#ifdef USE_ASSERT_CHECKING +#define Assert(condition) \ + if (!(condition)) \ + { \ + write_msg(NULL, "Failed assertion in %s, line %d\n", \ + __FILE__, __LINE__); \ + abort();\ + } +#else +#define Assert(condition) +#endif + #endif diff --git a/src/bin/pg_dump/pg_backup_custom.c b/src/bin/pg_dump/pg_backup_custom.c index c9adc6f82f..e44e6a3af1 100644 --- a/src/bin/pg_dump/pg_backup_custom.c +++ b/src/bin/pg_dump/pg_backup_custom.c @@ -27,6 +27,7 @@ #include "compress_io.h" #include "dumputils.h" #include "dumpmem.h" +#include "parallel.h" /*-------- * Routines in the format interface @@ -60,6 +61,10 @@ static void _LoadBlobs(ArchiveHandle *AH, bool drop); static void _Clone(ArchiveHandle *AH); static void _DeClone(ArchiveHandle *AH); +static char *_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act); +static int _MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act); +char *_WorkerJobRestoreCustom(ArchiveHandle *AH, TocEntry *te); + typedef struct { CompressorState *cs; @@ -128,6 +133,13 @@ InitArchiveFmt_Custom(ArchiveHandle *AH) AH->ClonePtr = _Clone; AH->DeClonePtr = _DeClone; + AH->MasterStartParallelItemPtr = _MasterStartParallelItem; + AH->MasterEndParallelItemPtr = _MasterEndParallelItem; + + /* no parallel dump in the custom archive, only parallel restore */ + AH->WorkerJobDumpPtr = NULL; + AH->WorkerJobRestorePtr = _WorkerJobRestoreCustom; + /* Set up a private area. */ ctx = (lclContext *) pg_malloc0(sizeof(lclContext)); AH->formatData = (void *) ctx; @@ -699,7 +711,7 @@ _CloseArchive(ArchiveHandle *AH) tpos = ftello(AH->FH); WriteToc(AH); ctx->dataStart = _getFilePos(AH, ctx); - WriteDataChunks(AH); + WriteDataChunks(AH, NULL); /* * If possible, re-write the TOC in order to update the data offset @@ -797,6 +809,81 @@ _DeClone(ArchiveHandle *AH) free(ctx); } +/* + * This function is executed in the child of a parallel backup for the + * custom format archive and dumps the actual data. + */ +char * +_WorkerJobRestoreCustom(ArchiveHandle *AH, TocEntry *te) +{ + /* short fixed-size string + some ID so far, this needs to be malloc'ed + * instead of static because we work with threads on windows */ + const int buflen = 64; + char *buf = (char*) pg_malloc(buflen); + ParallelArgs pargs; + int status; + lclTocEntry *tctx; + + tctx = (lclTocEntry *) te->formatData; + + pargs.AH = AH; + pargs.te = te; + + status = parallel_restore(&pargs); + + snprintf(buf, buflen, "OK RESTORE %d %d %d", te->dumpId, status, + status == WORKER_IGNORED_ERRORS ? 
AH->public.n_errors : 0); + + return buf; +} + +/* + * This function is executed in the parent process. Depending on the desired + * action (dump or restore) it creates a string that is understood by the + * _WorkerJobDumpDirectory/_WorkerJobRestoreDirectory functions of the + * respective dump format. + */ +static char * +_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act) +{ + /* + * A static char is okay here, even on Windows because we call this + * function only from one process (the master). + */ + static char buf[64]; /* short fixed-size string + number */ + + /* no parallel dump in the custom archive format */ + Assert(act == ACT_RESTORE); + + snprintf(buf, sizeof(buf), "RESTORE %d", te->dumpId); + + return buf; +} + +/* + * This function is executed in the parent process. It analyzes the response of + * the _WorkerJobDumpDirectory/_WorkerJobRestoreDirectory functions of the + * respective dump format. + */ +static int +_MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act) +{ + DumpId dumpId; + int nBytes, status, n_errors; + + /* no parallel dump in the custom archive */ + Assert(act == ACT_RESTORE); + + sscanf(str, "%u %u %u%n", &dumpId, &status, &n_errors, &nBytes); + + Assert(nBytes == strlen(str)); + Assert(dumpId == te->dumpId); + + AH->public.n_errors += n_errors; + + return status; +} + /*-------------------------------------------------- * END OF FORMAT CALLBACKS *-------------------------------------------------- diff --git a/src/bin/pg_dump/pg_backup_db.c b/src/bin/pg_dump/pg_backup_db.c index c295fc5ddd..a82c2c6b19 100644 --- a/src/bin/pg_dump/pg_backup_db.c +++ b/src/bin/pg_dump/pg_backup_db.c @@ -309,12 +309,30 @@ ConnectDatabase(Archive *AHX, PQsetNoticeProcessor(AH->connection, notice_processor, NULL); } +/* + * Close the connection to the database and also cancel off the query if we + * have one running. 
+ */ void DisconnectDatabase(Archive *AHX) { ArchiveHandle *AH = (ArchiveHandle *) AHX; + PGcancel *cancel; + char errbuf[1]; + + if (!AH->connection) + return; - PQfinish(AH->connection); /* noop if AH->connection is NULL */ + if (PQtransactionStatus(AH->connection) == PQTRANS_ACTIVE) + { + if ((cancel = PQgetCancel(AH->connection))) + { + PQcancel(cancel, errbuf, sizeof(errbuf)); + PQfreeCancel(cancel); + } + } + + PQfinish(AH->connection); AH->connection = NULL; } diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c index 574fdaa28a..fb3327d061 100644 --- a/src/bin/pg_dump/pg_backup_directory.c +++ b/src/bin/pg_dump/pg_backup_directory.c @@ -36,6 +36,7 @@ #include "compress_io.h" #include "dumpmem.h" #include "dumputils.h" +#include "parallel.h" #include #include @@ -51,6 +52,7 @@ typedef struct cfp *dataFH; /* currently open data file */ cfp *blobsTocFH; /* file handle for blobs.toc */ + ParallelState *pstate; /* for parallel backup / restore */ } lclContext; typedef struct @@ -71,6 +73,7 @@ static int _ReadByte(ArchiveHandle *); static size_t _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len); static size_t _ReadBuf(ArchiveHandle *AH, void *buf, size_t len); static void _CloseArchive(ArchiveHandle *AH); +static void _ReopenArchive(ArchiveHandle *AH); static void _PrintTocData(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt); static void _WriteExtraToc(ArchiveHandle *AH, TocEntry *te); @@ -83,8 +86,17 @@ static void _EndBlob(ArchiveHandle *AH, TocEntry *te, Oid oid); static void _EndBlobs(ArchiveHandle *AH, TocEntry *te); static void _LoadBlobs(ArchiveHandle *AH, RestoreOptions *ropt); -static char *prependDirectory(ArchiveHandle *AH, const char *relativeFilename); +static void _Clone(ArchiveHandle *AH); +static void _DeClone(ArchiveHandle *AH); +static char *_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act); +static int _MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, + const char *str, T_Action act); +static char *_WorkerJobRestoreDirectory(ArchiveHandle *AH, TocEntry *te); +static char *_WorkerJobDumpDirectory(ArchiveHandle *AH, TocEntry *te); + +static char *prependDirectory(ArchiveHandle *AH, char *buf, + const char *relativeFilename); /* * Init routine required by ALL formats. 
This is a global routine @@ -111,7 +123,7 @@ InitArchiveFmt_Directory(ArchiveHandle *AH) AH->WriteBufPtr = _WriteBuf; AH->ReadBufPtr = _ReadBuf; AH->ClosePtr = _CloseArchive; - AH->ReopenPtr = NULL; + AH->ReopenPtr = _ReopenArchive; AH->PrintTocDataPtr = _PrintTocData; AH->ReadExtraTocPtr = _ReadExtraToc; AH->WriteExtraTocPtr = _WriteExtraToc; @@ -122,8 +134,14 @@ InitArchiveFmt_Directory(ArchiveHandle *AH) AH->EndBlobPtr = _EndBlob; AH->EndBlobsPtr = _EndBlobs; - AH->ClonePtr = NULL; - AH->DeClonePtr = NULL; + AH->ClonePtr = _Clone; + AH->DeClonePtr = _DeClone; + + AH->WorkerJobRestorePtr = _WorkerJobRestoreDirectory; + AH->WorkerJobDumpPtr = _WorkerJobDumpDirectory; + + AH->MasterStartParallelItemPtr = _MasterStartParallelItem; + AH->MasterEndParallelItemPtr = _MasterEndParallelItem; /* Set up our private context */ ctx = (lclContext *) pg_malloc0(sizeof(lclContext)); @@ -147,16 +165,37 @@ InitArchiveFmt_Directory(ArchiveHandle *AH) if (AH->mode == archModeWrite) { - if (mkdir(ctx->directory, 0700) < 0) + struct stat st; + bool is_empty = false; + + /* we accept an empty existing directory */ + if (stat(ctx->directory, &st) == 0 && S_ISDIR(st.st_mode)) + { + DIR* dir = opendir(ctx->directory); + if (dir) { + struct dirent *d; + is_empty = true; + while ((d = readdir(dir))) { + if (strcmp(d->d_name, ".") != 0 && strcmp(d->d_name, "..") != 0) + { + is_empty = false; + break; + } + } + closedir(dir); + } + } + + if (!is_empty && mkdir(ctx->directory, 0700) < 0) exit_horribly(modulename, "could not create directory \"%s\": %s\n", ctx->directory, strerror(errno)); } else { /* Read Mode */ - char *fname; + char fname[MAXPGPATH]; cfp *tocFH; - fname = prependDirectory(AH, "toc.dat"); + prependDirectory(AH, fname, "toc.dat"); tocFH = cfopen_read(fname, PG_BINARY_R); if (tocFH == NULL) @@ -282,9 +321,9 @@ _StartData(ArchiveHandle *AH, TocEntry *te) { lclTocEntry *tctx = (lclTocEntry *) te->formatData; lclContext *ctx = (lclContext *) AH->formatData; - char *fname; + char fname[MAXPGPATH]; - fname = prependDirectory(AH, tctx->filename); + prependDirectory(AH, fname, tctx->filename); ctx->dataFH = cfopen_write(fname, PG_BINARY_W, AH->compression); if (ctx->dataFH == NULL) @@ -309,6 +348,9 @@ _WriteData(ArchiveHandle *AH, const void *data, size_t dLen) if (dLen == 0) return 0; + /* Are we aborting? */ + checkAborting(AH); + return cfwrite(data, dLen, ctx->dataFH); } @@ -376,8 +418,9 @@ _PrintTocData(ArchiveHandle *AH, TocEntry *te, RestoreOptions *ropt) _LoadBlobs(AH, ropt); else { - char *fname = prependDirectory(AH, tctx->filename); + char fname[MAXPGPATH]; + prependDirectory(AH, fname, tctx->filename); _PrintFileData(AH, fname, ropt); } } @@ -387,12 +430,12 @@ _LoadBlobs(ArchiveHandle *AH, RestoreOptions *ropt) { Oid oid; lclContext *ctx = (lclContext *) AH->formatData; - char *fname; + char fname[MAXPGPATH]; char line[MAXPGPATH]; StartRestoreBlobs(AH); - fname = prependDirectory(AH, "blobs.toc"); + prependDirectory(AH, fname, "blobs.toc"); ctx->blobsTocFH = cfopen_read(fname, PG_BINARY_R); @@ -475,6 +518,9 @@ _WriteBuf(ArchiveHandle *AH, const void *buf, size_t len) lclContext *ctx = (lclContext *) AH->formatData; size_t res; + /* Are we aborting? 
*/ + checkAborting(AH); + res = cfwrite(buf, len, ctx->dataFH); if (res != len) exit_horribly(modulename, "could not write to output file: %s\n", @@ -519,7 +565,12 @@ _CloseArchive(ArchiveHandle *AH) if (AH->mode == archModeWrite) { cfp *tocFH; - char *fname = prependDirectory(AH, "toc.dat"); + char fname[MAXPGPATH]; + + prependDirectory(AH, fname, "toc.dat"); + + /* this will actually fork the processes for a parallel backup */ + ctx->pstate = ParallelBackupStart(AH, NULL); /* The TOC is always created uncompressed */ tocFH = cfopen_write(fname, PG_BINARY_W, 0); @@ -540,11 +591,24 @@ _CloseArchive(ArchiveHandle *AH) if (cfclose(tocFH) != 0) exit_horribly(modulename, "could not close TOC file: %s\n", strerror(errno)); - WriteDataChunks(AH); + WriteDataChunks(AH, ctx->pstate); + + ParallelBackupEnd(AH, ctx->pstate); } AH->FH = NULL; } +/* + * Reopen the archive's file handle. + */ +static void +_ReopenArchive(ArchiveHandle *AH) +{ + /* + * Our TOC is in memory, our data files are opened by each child anyway as + * they are separate. We support reopening the archive by just doing nothing. + */ +} /* * BLOB support @@ -561,9 +625,9 @@ static void _StartBlobs(ArchiveHandle *AH, TocEntry *te) { lclContext *ctx = (lclContext *) AH->formatData; - char *fname; + char fname[MAXPGPATH]; - fname = prependDirectory(AH, "blobs.toc"); + prependDirectory(AH, fname, "blobs.toc"); /* The blob TOC file is never compressed */ ctx->blobsTocFH = cfopen_write(fname, "ab", 0); @@ -628,12 +692,16 @@ _EndBlobs(ArchiveHandle *AH, TocEntry *te) ctx->blobsTocFH = NULL; } - +/* + * Gets a relative file name and prepends the output directory, writing the + * result to buf. The caller needs to make sure that buf is MAXPGPATH bytes + * big. Can't use a static char[MAXPGPATH] inside the function because we run + * multithreaded on Windows. + */ static char * -prependDirectory(ArchiveHandle *AH, const char *relativeFilename) +prependDirectory(ArchiveHandle *AH, char *buf, const char *relativeFilename) { lclContext *ctx = (lclContext *) AH->formatData; - static char buf[MAXPGPATH]; char *dname; dname = ctx->directory; @@ -647,3 +715,152 @@ prependDirectory(ArchiveHandle *AH, const char *relativeFilename) return buf; } + +/* + * Clone format-specific fields during parallel restoration. + */ +static void +_Clone(ArchiveHandle *AH) +{ + lclContext *ctx = (lclContext *) AH->formatData; + + AH->formatData = (lclContext *) pg_malloc(sizeof(lclContext)); + memcpy(AH->formatData, ctx, sizeof(lclContext)); + ctx = (lclContext *) AH->formatData; + + /* + * Note: we do not make a local lo_buf because we expect at most one BLOBS + * entry per archive, so no parallelism is possible. Likewise, + * TOC-entry-local state isn't an issue because any one TOC entry is + * touched by just one worker child. + */ + + /* + * We also don't copy the ParallelState pointer (pstate), only the master + * process ever writes to it. + */ +} + +static void +_DeClone(ArchiveHandle *AH) +{ + lclContext *ctx = (lclContext *) AH->formatData; + free(ctx); +} + +/* + * This function is executed in the parent process. Depending on the desired + * action (dump or restore) it creates a string that is understood by the + * _WorkerJobDumpDirectory/_WorkerJobRestoreDirectory functions of the + * respective dump format. + */ +static char * +_MasterStartParallelItem(ArchiveHandle *AH, TocEntry *te, T_Action act) +{ + /* + * A static char is okay here, even on Windows because we call this + * function only from one process (the master). 
+ */ + static char buf[64]; + + if (act == ACT_DUMP) + snprintf(buf, sizeof(buf), "DUMP %d", te->dumpId); + else if (act == ACT_RESTORE) + snprintf(buf, sizeof(buf), "RESTORE %d", te->dumpId); + + return buf; +} + +/* + * This function is executed in the child of a parallel backup for the + * directory archive and dumps the actual data. + * + * We are currently returning only the DumpId so theoretically we could + * make this function returning an int (or a DumpId). However, to + * facilitate further enhancements and because sooner or later we need to + * convert this to a string and send it via a message anyway, we stick with + * char *. It is parsed on the other side by the _EndMasterParallel() + * function of the respective dump format. + */ +static char * +_WorkerJobDumpDirectory(ArchiveHandle *AH, TocEntry *te) +{ + /* short fixed-size string + some ID so far, this needs to be malloc'ed + * instead of static because we work with threads on windows */ + const int buflen = 64; + char *buf = (char*) pg_malloc(buflen); + lclTocEntry *tctx = (lclTocEntry *) te->formatData; + + /* This should never happen */ + if (!tctx) + exit_horribly(modulename, "Error during backup\n"); + + /* + * This function returns void. We either fail and die horribly or succeed... + * A failure will be detected by the parent when the child dies unexpectedly. + */ + WriteDataChunksForTocEntry(AH, te); + + snprintf(buf, buflen, "OK DUMP %d", te->dumpId); + + return buf; +} + +/* + * This function is executed in the child of a parallel backup for the + * directory archive and dumps the actual data. + */ +static char * +_WorkerJobRestoreDirectory(ArchiveHandle *AH, TocEntry *te) +{ + /* short fixed-size string + some ID so far, this needs to be malloc'ed + * instead of static because we work with threads on windows */ + const int buflen = 64; + char *buf = (char*) pg_malloc(buflen); + ParallelArgs pargs; + int status; + lclTocEntry *tctx; + + tctx = (lclTocEntry *) te->formatData; + + pargs.AH = AH; + pargs.te = te; + + status = parallel_restore(&pargs); + + snprintf(buf, buflen, "OK RESTORE %d %d %d", te->dumpId, status, + status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0); + + return buf; +} +/* + * This function is executed in the parent process. It analyzes the response of + * the _WorkerJobDumpDirectory/_WorkerJobRestoreDirectory functions of the + * respective dump format. + */ +static int +_MasterEndParallelItem(ArchiveHandle *AH, TocEntry *te, const char *str, T_Action act) +{ + DumpId dumpId; + int nBytes, n_errors; + int status = 0; + + if (act == ACT_DUMP) + { + sscanf(str, "%u%n", &dumpId, &nBytes); + + Assert(dumpId == te->dumpId); + Assert(nBytes == strlen(str)); + } + else if (act == ACT_RESTORE) + { + sscanf(str, "%u %u %u%n", &dumpId, &status, &n_errors, &nBytes); + + Assert(dumpId == te->dumpId); + Assert(nBytes == strlen(str)); + + AH->public.n_errors += n_errors; + } + + return status; +} diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index ee277b1f8c..a5673d65d6 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -156,6 +156,12 @@ InitArchiveFmt_Tar(ArchiveHandle *AH) AH->ClonePtr = NULL; AH->DeClonePtr = NULL; + AH->MasterStartParallelItemPtr = NULL; + AH->MasterEndParallelItemPtr = NULL; + + AH->WorkerJobDumpPtr = NULL; + AH->WorkerJobRestorePtr = NULL; + /* * Set up some special context used in compressing data. 
*/ @@ -826,7 +832,7 @@ _CloseArchive(ArchiveHandle *AH) /* * Now send the data (tables & blobs) */ - WriteDataChunks(AH); + WriteDataChunks(AH, NULL); /* * Now this format wants to append a script which does a full restore diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index dd2019a1fb..420fc8c36d 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -136,6 +136,7 @@ static int disable_dollar_quoting = 0; static int dump_inserts = 0; static int column_inserts = 0; static int no_security_labels = 0; +static int no_synchronized_snapshots = 0; static int no_unlogged_table_data = 0; static int serializable_deferrable = 0; @@ -241,8 +242,6 @@ static Oid findLastBuiltinOid_V70(Archive *fout); static void selectSourceSchema(Archive *fout, const char *schemaName); static char *getFormattedTypeName(Archive *fout, Oid oid, OidOptions opts); static char *myFormatType(const char *typname, int32 typmod); -static const char *fmtQualifiedId(Archive *fout, - const char *schema, const char *id); static void getBlobs(Archive *fout); static void dumpBlob(Archive *fout, BlobInfo *binfo); static int dumpBlobs(Archive *fout, void *arg); @@ -260,7 +259,8 @@ static void binary_upgrade_extension_member(PQExpBuffer upgrade_buffer, DumpableObject *dobj, const char *objlabel); static const char *getAttrName(int attrnum, TableInfo *tblInfo); -static const char *fmtCopyColumnList(const TableInfo *ti); +static const char *fmtCopyColumnList(const TableInfo *ti, PQExpBuffer buffer); +static char *get_synchronized_snapshot(Archive *fout); static PGresult *ExecuteSqlQueryForSingleRow(Archive *fout, char *query); @@ -282,6 +282,7 @@ main(int argc, char **argv) int numObjs; DumpableObject *boundaryObjs; int i; + int numWorkers = 1; enum trivalue prompt_password = TRI_DEFAULT; int compressLevel = -1; int plainText = 0; @@ -311,6 +312,7 @@ main(int argc, char **argv) {"format", required_argument, NULL, 'F'}, {"host", required_argument, NULL, 'h'}, {"ignore-version", no_argument, NULL, 'i'}, + {"jobs", 1, NULL, 'j'}, {"no-reconnect", no_argument, NULL, 'R'}, {"oids", no_argument, NULL, 'o'}, {"no-owner", no_argument, NULL, 'O'}, @@ -350,6 +352,7 @@ main(int argc, char **argv) {"serializable-deferrable", no_argument, &serializable_deferrable, 1}, {"use-set-session-authorization", no_argument, &use_setsessauth, 1}, {"no-security-labels", no_argument, &no_security_labels, 1}, + {"no-synchronized-snapshots", no_argument, &no_synchronized_snapshots, 1}, {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1}, {NULL, 0, NULL, 0} @@ -357,6 +360,12 @@ main(int argc, char **argv) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_dump")); + /* + * Initialize what we need for parallel execution, especially for thread + * support on Windows. 
+ */ + init_parallel_dump_utils(); + g_verbose = false; strcpy(g_comment_start, "-- "); @@ -387,7 +396,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "abcCE:f:F:h:in:N:oOp:RsS:t:T:U:vwWxZ:", + while ((c = getopt_long(argc, argv, "abcCE:f:F:h:ij:n:N:oOp:RsS:t:T:U:vwWxZ:", long_options, &optindex)) != -1) { switch (c) @@ -428,6 +437,10 @@ main(int argc, char **argv) /* ignored, deprecated option */ break; + case 'j': /* number of dump jobs */ + numWorkers = atoi(optarg); + break; + case 'n': /* include schema(s) */ simple_string_list_append(&schema_include_patterns, optarg); include_everything = false; @@ -567,6 +580,22 @@ main(int argc, char **argv) compressLevel = 0; } + /* + * On Windows we can only have at most MAXIMUM_WAIT_OBJECTS (= 64 usually) + * parallel jobs because that's the maximum limit for the + * WaitForMultipleObjects() call. + */ + if (numWorkers <= 0 +#ifdef WIN32 + || numWorkers > MAXIMUM_WAIT_OBJECTS +#endif + ) + exit_horribly(NULL, "%s: invalid number of parallel jobs\n", progname); + + /* Parallel backup only in the directory archive format so far */ + if (archiveFormat != archDirectory && numWorkers > 1) + exit_horribly(NULL, "parallel backup only supported by the directory format\n"); + /* Open the output file */ fout = CreateArchive(filename, archiveFormat, compressLevel, archiveMode); @@ -590,6 +619,8 @@ main(int argc, char **argv) fout->minRemoteVersion = 70000; fout->maxRemoteVersion = (my_version / 100) * 100 + 99; + fout->numWorkers = numWorkers; + /* * Open the database using the Archiver, so it knows about it. Errors mean * death. @@ -604,25 +635,6 @@ main(int argc, char **argv) if (fout->remoteVersion < 90100) no_security_labels = 1; - /* - * Start transaction-snapshot mode transaction to dump consistent data. 
- */ - ExecuteSqlStatement(fout, "BEGIN"); - if (fout->remoteVersion >= 90100) - { - if (serializable_deferrable) - ExecuteSqlStatement(fout, - "SET TRANSACTION ISOLATION LEVEL " - "SERIALIZABLE, READ ONLY, DEFERRABLE"); - else - ExecuteSqlStatement(fout, - "SET TRANSACTION ISOLATION LEVEL " - "REPEATABLE READ"); - } - else - ExecuteSqlStatement(fout, - "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"); - /* Select the appropriate subquery to convert user IDs to names */ if (fout->remoteVersion >= 80100) username_subquery = "SELECT rolname FROM pg_catalog.pg_roles WHERE oid ="; @@ -631,6 +643,14 @@ main(int argc, char **argv) else username_subquery = "SELECT usename FROM pg_user WHERE usesysid ="; + /* check the version for the synchronized snapshots feature */ + if (numWorkers > 1 && fout->remoteVersion < 90200 + && !no_synchronized_snapshots) + exit_horribly(NULL, + "No synchronized snapshots available in this server version.\n" + "Run with --no-synchronized-snapshots instead if you do not\n" + "need synchronized snapshots.\n"); + /* Find the last built-in OID, if needed */ if (fout->remoteVersion < 70300) { @@ -727,6 +747,10 @@ main(int argc, char **argv) else sortDumpableObjectsByTypeOid(dobjs, numObjs); + /* If we do a parallel dump, we want the largest tables to go first */ + if (archiveFormat == archDirectory && numWorkers > 1) + sortDataAndIndexObjectsBySize(dobjs, numObjs); + sortDumpableObjects(dobjs, numObjs, boundaryObjs[0].dumpId, boundaryObjs[1].dumpId); @@ -808,6 +832,7 @@ help(const char *progname) printf(_(" -f, --file=FILENAME output file or directory name\n")); printf(_(" -F, --format=c|d|t|p output file format (custom, directory, tar,\n" " plain text (default))\n")); + printf(_(" -j, --jobs=NUM use this many parallel jobs to dump\n")); printf(_(" -v, --verbose verbose mode\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -Z, --compress=0-9 compression level for compressed formats\n")); @@ -837,6 +862,7 @@ help(const char *progname) printf(_(" --exclude-table-data=TABLE do NOT dump data for the named table(s)\n")); printf(_(" --inserts dump data as INSERT commands, rather than COPY\n")); printf(_(" --no-security-labels do not dump security label assignments\n")); + printf(_(" --no-synchronized-snapshots parallel processes should not use synchronized snapshots\n")); printf(_(" --no-tablespaces do not dump tablespace assignments\n")); printf(_(" --no-unlogged-table-data do not dump unlogged table data\n")); printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); @@ -865,7 +891,12 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) PGconn *conn = GetConnection(AH); const char *std_strings; - /* Set the client encoding if requested */ + /* + * Set the client encoding if requested. If dumpencoding == NULL then + * either it hasn't been requested or we're a cloned connection and then this + * has already been set in CloneArchive according to the original + * connection encoding. 
+ */ if (dumpencoding) { if (PQsetClientEncoding(conn, dumpencoding) < 0) @@ -882,6 +913,10 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) std_strings = PQparameterStatus(conn, "standard_conforming_strings"); AH->std_strings = (std_strings && strcmp(std_strings, "on") == 0); + /* Set the role if requested */ + if (!use_role && AH->use_role) + use_role = AH->use_role; + /* Set the role if requested */ if (use_role && AH->remoteVersion >= 80100) { @@ -890,6 +925,10 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) appendPQExpBuffer(query, "SET ROLE %s", fmtId(use_role)); ExecuteSqlStatement(AH, query->data); destroyPQExpBuffer(query); + + /* save this for later use on parallel connections */ + if (!AH->use_role) + AH->use_role = strdup(use_role); } /* Set the datestyle to ISO to ensure the dump's portability */ @@ -926,6 +965,59 @@ setup_connection(Archive *AH, const char *dumpencoding, char *use_role) */ if (quote_all_identifiers && AH->remoteVersion >= 90100) ExecuteSqlStatement(AH, "SET quote_all_identifiers = true"); + + /* + * Start transaction-snapshot mode transaction to dump consistent data. + */ + ExecuteSqlStatement(AH, "BEGIN"); + if (AH->remoteVersion >= 90100) + { + if (serializable_deferrable) + ExecuteSqlStatement(AH, + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE, " + "READ ONLY, DEFERRABLE"); + else + ExecuteSqlStatement(AH, + "SET TRANSACTION ISOLATION LEVEL REPEATABLE READ"); + } + else + ExecuteSqlStatement(AH, "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"); + + if (AH->numWorkers > 1 && AH->remoteVersion >= 90200 && !no_synchronized_snapshots) + { + if (AH->sync_snapshot_id) + { + PQExpBuffer query = createPQExpBuffer(); + appendPQExpBuffer(query, "SET TRANSACTION SNAPSHOT "); + appendStringLiteralConn(query, AH->sync_snapshot_id, conn); + destroyPQExpBuffer(query); + } + else + AH->sync_snapshot_id = get_synchronized_snapshot(AH); + } +} + +/* + * Initialize the connection for a new worker process. + */ +void +_SetupWorker(Archive *AHX, RestoreOptions *ropt) +{ + setup_connection(AHX, NULL, NULL); +} + +static char* +get_synchronized_snapshot(Archive *fout) +{ + char *query = "select pg_export_snapshot()"; + char *result; + PGresult *res; + + res = ExecuteSqlQueryForSingleRow(fout, query); + result = strdup(PQgetvalue(res, 0, 0)); + PQclear(res); + + return result; } static ArchiveFormat @@ -1243,6 +1335,11 @@ dumpTableData_copy(Archive *fout, void *dcontext) const bool hasoids = tbinfo->hasoids; const bool oids = tdinfo->oids; PQExpBuffer q = createPQExpBuffer(); + /* + * Note: can't use getThreadLocalPQExpBuffer() here, we're calling fmtId which + * uses it already. + */ + PQExpBuffer clistBuf = createPQExpBuffer(); PGconn *conn = GetConnection(fout); PGresult *res; int ret; @@ -1267,14 +1364,14 @@ dumpTableData_copy(Archive *fout, void *dcontext) * cases involving ADD COLUMN and inheritance.) 
*/ if (fout->remoteVersion >= 70300) - column_list = fmtCopyColumnList(tbinfo); + column_list = fmtCopyColumnList(tbinfo, clistBuf); else column_list = ""; /* can't select columns in COPY */ if (oids && hasoids) { appendPQExpBuffer(q, "COPY %s %s WITH OIDS TO stdout;", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname), column_list); @@ -1292,7 +1389,7 @@ dumpTableData_copy(Archive *fout, void *dcontext) else appendPQExpBufferStr(q, "* "); appendPQExpBuffer(q, "FROM %s %s) TO stdout;", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname), tdinfo->filtercond); @@ -1300,13 +1397,14 @@ dumpTableData_copy(Archive *fout, void *dcontext) else { appendPQExpBuffer(q, "COPY %s %s TO stdout;", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname), column_list); } res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT); PQclear(res); + destroyPQExpBuffer(clistBuf); for (;;) { @@ -1425,7 +1523,7 @@ dumpTableData_insert(Archive *fout, void *dcontext) { appendPQExpBuffer(q, "DECLARE _pg_dump_cursor CURSOR FOR " "SELECT * FROM ONLY %s", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname)); } @@ -1433,7 +1531,7 @@ dumpTableData_insert(Archive *fout, void *dcontext) { appendPQExpBuffer(q, "DECLARE _pg_dump_cursor CURSOR FOR " "SELECT * FROM %s", - fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tbinfo->dobj.namespace->dobj.name, classname)); } @@ -1565,6 +1663,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo) { TableInfo *tbinfo = tdinfo->tdtable; PQExpBuffer copyBuf = createPQExpBuffer(); + PQExpBuffer clistBuf = createPQExpBuffer(); DataDumperPtr dumpFn; char *copyStmt; @@ -1576,7 +1675,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo) appendPQExpBuffer(copyBuf, "COPY %s ", fmtId(tbinfo->dobj.name)); appendPQExpBuffer(copyBuf, "%s %sFROM stdin;\n", - fmtCopyColumnList(tbinfo), + fmtCopyColumnList(tbinfo, clistBuf), (tdinfo->oids && tbinfo->hasoids) ? 
"WITH OIDS " : ""); copyStmt = copyBuf->data; } @@ -1601,6 +1700,7 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo) dumpFn, tdinfo); destroyPQExpBuffer(copyBuf); + destroyPQExpBuffer(clistBuf); } /* @@ -3867,6 +3967,7 @@ getTables(Archive *fout, int *numTables) int i_reloptions; int i_toastreloptions; int i_reloftype; + int i_relpages; /* Make sure we are in proper schema */ selectSourceSchema(fout, "pg_catalog"); @@ -3906,6 +4007,7 @@ getTables(Archive *fout, int *numTables) "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " "c.relpersistence, " + "c.relpages, " "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -3942,6 +4044,7 @@ getTables(Archive *fout, int *numTables) "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " "'p' AS relpersistence, " + "c.relpages, " "CASE WHEN c.reloftype <> 0 THEN c.reloftype::pg_catalog.regtype ELSE NULL END AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -3977,6 +4080,7 @@ getTables(Archive *fout, int *numTables) "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " "'p' AS relpersistence, " + "c.relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4012,6 +4116,7 @@ getTables(Archive *fout, int *numTables) "c.relfrozenxid, tc.oid AS toid, " "tc.relfrozenxid AS tfrozenxid, " "'p' AS relpersistence, " + "c.relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4048,6 +4153,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, " + "relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4083,6 +4189,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, " + "relpages, " "NULL AS reloftype, " "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " @@ -4114,6 +4221,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, " + "relpages, " "NULL AS reloftype, " "NULL::oid AS owning_tab, " "NULL::int4 AS owning_col, " @@ -4140,6 +4248,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, " + "relpages, " "NULL AS reloftype, " "NULL::oid AS owning_tab, " "NULL::int4 AS owning_col, " @@ -4176,6 +4285,7 @@ getTables(Archive *fout, int *numTables) "0 AS toid, " "0 AS tfrozenxid, " "'p' AS relpersistence, " + "0 AS relpages, " "NULL AS reloftype, " "NULL::oid AS owning_tab, " "NULL::int4 AS owning_col, " @@ -4229,6 +4339,7 @@ getTables(Archive *fout, int *numTables) i_reloptions = PQfnumber(res, "reloptions"); i_toastreloptions = PQfnumber(res, "toast_reloptions"); i_reloftype = PQfnumber(res, "reloftype"); + i_relpages = PQfnumber(res, "relpages"); if (lockWaitTimeout && fout->remoteVersion >= 70300) { @@ -4285,6 +4396,7 @@ getTables(Archive *fout, int *numTables) tblinfo[i].reltablespace = pg_strdup(PQgetvalue(res, i, i_reltablespace)); tblinfo[i].reloptions = pg_strdup(PQgetvalue(res, i, i_reloptions)); tblinfo[i].toast_reloptions = pg_strdup(PQgetvalue(res, i, i_toastreloptions)); + tblinfo[i].relpages = atoi(PQgetvalue(res, i, i_relpages)); /* other fields were zeroed above */ @@ -4313,7 +4425,7 @@ getTables(Archive *fout, int *numTables) resetPQExpBuffer(query); appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE", - 
fmtQualifiedId(fout, + fmtQualifiedId(fout->remoteVersion, tblinfo[i].dobj.namespace->dobj.name, tblinfo[i].dobj.name)); ExecuteSqlStatement(fout, query->data); @@ -4452,7 +4564,8 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_conoid, i_condef, i_tablespace, - i_options; + i_options, + i_relpages; int ntups; for (i = 0; i < numTables; i++) @@ -4494,6 +4607,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4519,6 +4633,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4547,6 +4662,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4575,6 +4691,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " + "t.relpages, " "c.contype, c.conname, " "c.condeferrable, c.condeferred, " "c.tableoid AS contableoid, " @@ -4603,6 +4720,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, false AS indisclustered, " + "t.relpages, " "CASE WHEN i.indisprimary THEN 'p'::char " "ELSE '0'::char END AS contype, " "t.relname AS conname, " @@ -4629,6 +4747,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) "pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, false AS indisclustered, " + "t.relpages, " "CASE WHEN i.indisprimary THEN 'p'::char " "ELSE '0'::char END AS contype, " "t.relname AS conname, " @@ -4657,6 +4776,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_indnkeys = PQfnumber(res, "indnkeys"); i_indkey = PQfnumber(res, "indkey"); i_indisclustered = PQfnumber(res, "indisclustered"); + i_relpages = PQfnumber(res, "relpages"); i_contype = PQfnumber(res, "contype"); i_conname = PQfnumber(res, "conname"); i_condeferrable = PQfnumber(res, "condeferrable"); @@ -4699,6 +4819,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) parseOidArray(PQgetvalue(res, j, i_indkey), indxinfo[j].indkeys, INDEX_MAX_KEYS); indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't'); + indxinfo[j].relpages = atoi(PQgetvalue(res, j, i_relpages)); contype = *(PQgetvalue(res, j, i_contype)); if (contype == 'p' || contype == 'u' || contype == 'x') @@ -14476,22 +14597,21 @@ findDumpableDependencies(ArchiveHandle *AH, DumpableObject *dobj, * * Whenever the selected schema is not pg_catalog, be careful to qualify * references to system catalogs and types in our emitted commands! + * + * This function is called only from selectSourceSchemaOnAH and + * selectSourceSchema. 
*/ static void selectSourceSchema(Archive *fout, const char *schemaName) { - static char *curSchemaName = NULL; PQExpBuffer query; + /* This is checked by the callers already */ + Assert(schemaName != NULL && *schemaName != '\0'); + /* Not relevant if fetching from pre-7.3 DB */ if (fout->remoteVersion < 70300) return; - /* Ignore null schema names */ - if (schemaName == NULL || *schemaName == '\0') - return; - /* Optimize away repeated selection of same schema */ - if (curSchemaName && strcmp(curSchemaName, schemaName) == 0) - return; query = createPQExpBuffer(); appendPQExpBuffer(query, "SET search_path = %s", @@ -14502,9 +14622,6 @@ selectSourceSchema(Archive *fout, const char *schemaName) ExecuteSqlStatement(fout, query->data); destroyPQExpBuffer(query); - if (curSchemaName) - free(curSchemaName); - curSchemaName = pg_strdup(schemaName); } /* @@ -14641,34 +14758,6 @@ myFormatType(const char *typname, int32 typmod) return result; } -/* - * fmtQualifiedId - convert a qualified name to the proper format for - * the source database. - * - * Like fmtId, use the result before calling again. - */ -static const char * -fmtQualifiedId(Archive *fout, const char *schema, const char *id) -{ - static PQExpBuffer id_return = NULL; - - if (id_return) /* first time through? */ - resetPQExpBuffer(id_return); - else - id_return = createPQExpBuffer(); - - /* Suppress schema name if fetching from pre-7.3 DB */ - if (fout->remoteVersion >= 70300 && schema && *schema) - { - appendPQExpBuffer(id_return, "%s.", - fmtId(schema)); - } - appendPQExpBuffer(id_return, "%s", - fmtId(id)); - - return id_return->data; -} - /* * Return a column list clause for the given relation. * @@ -14676,37 +14765,31 @@ fmtQualifiedId(Archive *fout, const char *schema, const char *id) * "", not an invalid "()" column list. */ static const char * -fmtCopyColumnList(const TableInfo *ti) +fmtCopyColumnList(const TableInfo *ti, PQExpBuffer buffer) { - static PQExpBuffer q = NULL; int numatts = ti->numatts; char **attnames = ti->attnames; bool *attisdropped = ti->attisdropped; bool needComma; int i; - if (q) /* first time through? 
*/ - resetPQExpBuffer(q); - else - q = createPQExpBuffer(); - - appendPQExpBuffer(q, "("); + appendPQExpBuffer(buffer, "("); needComma = false; for (i = 0; i < numatts; i++) { if (attisdropped[i]) continue; if (needComma) - appendPQExpBuffer(q, ", "); - appendPQExpBuffer(q, "%s", fmtId(attnames[i])); + appendPQExpBuffer(buffer, ", "); + appendPQExpBuffer(buffer, "%s", fmtId(attnames[i])); needComma = true; } if (!needComma) return ""; /* no undropped columns */ - appendPQExpBuffer(q, ")"); - return q->data; + appendPQExpBuffer(buffer, ")"); + return buffer->data; } /* diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 2aa206038d..40b24d2c54 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -259,6 +259,7 @@ typedef struct _tableInfo /* these two are set only if table is a sequence owned by a column: */ Oid owning_tab; /* OID of table owning sequence */ int owning_col; /* attr # of column owning sequence */ + int relpages; bool interesting; /* true if need to collect more data */ @@ -322,6 +323,7 @@ typedef struct _indxInfo bool indisclustered; /* if there is an associated constraint object, its dumpId: */ DumpId indexconstraint; + int relpages; /* relpages of the underlying table */ } IndxInfo; typedef struct _ruleInfo @@ -541,6 +543,7 @@ extern void sortDumpableObjects(DumpableObject **objs, int numObjs, DumpId preBoundaryId, DumpId postBoundaryId); extern void sortDumpableObjectsByTypeName(DumpableObject **objs, int numObjs); extern void sortDumpableObjectsByTypeOid(DumpableObject **objs, int numObjs); +extern void sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs); /* * version specific routines diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index f0dc14592a..9b9d84bfa2 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -142,6 +142,93 @@ static void repairDependencyLoop(DumpableObject **loop, static void describeDumpableObject(DumpableObject *obj, char *buf, int bufsize); +static int DOSizeCompare(const void *p1, const void *p2); + +static int +findFirstEqualType(DumpableObjectType type, DumpableObject **objs, int numObjs) +{ + int i; + for (i = 0; i < numObjs; i++) + if (objs[i]->objType == type) + return i; + return -1; +} + +static int +findFirstDifferentType(DumpableObjectType type, DumpableObject **objs, int numObjs, int start) +{ + int i; + for (i = start; i < numObjs; i++) + if (objs[i]->objType != type) + return i; + return numObjs - 1; +} + +/* + * When we do a parallel dump, we want to start with the largest items first. + * + * Say we have the objects in this order: + * ....DDDDD....III.... + * + * with D = Table data, I = Index, . = other object + * + * This sorting function now takes each of the D or I blocks and sorts them + * according to their size. 
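+ * The sort is descending by relpages (see DOSizeCompare below), so the
+ * biggest tables and indexes are dispatched to the workers first.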
+ */ +void +sortDataAndIndexObjectsBySize(DumpableObject **objs, int numObjs) +{ + int startIdx, endIdx; + void *startPtr; + + if (numObjs <= 1) + return; + + startIdx = findFirstEqualType(DO_TABLE_DATA, objs, numObjs); + if (startIdx >= 0) + { + endIdx = findFirstDifferentType(DO_TABLE_DATA, objs, numObjs, startIdx); + startPtr = objs + startIdx; + qsort(startPtr, endIdx - startIdx, sizeof(DumpableObject *), + DOSizeCompare); + } + + startIdx = findFirstEqualType(DO_INDEX, objs, numObjs); + if (startIdx >= 0) + { + endIdx = findFirstDifferentType(DO_INDEX, objs, numObjs, startIdx); + startPtr = objs + startIdx; + qsort(startPtr, endIdx - startIdx, sizeof(DumpableObject *), + DOSizeCompare); + } +} + +static int +DOSizeCompare(const void *p1, const void *p2) +{ + DumpableObject *obj1 = *(DumpableObject **) p1; + DumpableObject *obj2 = *(DumpableObject **) p2; + int obj1_size = 0; + int obj2_size = 0; + + if (obj1->objType == DO_TABLE_DATA) + obj1_size = ((TableDataInfo *) obj1)->tdtable->relpages; + if (obj1->objType == DO_INDEX) + obj1_size = ((IndxInfo *) obj1)->relpages; + + if (obj2->objType == DO_TABLE_DATA) + obj2_size = ((TableDataInfo *) obj2)->tdtable->relpages; + if (obj2->objType == DO_INDEX) + obj2_size = ((IndxInfo *) obj2)->relpages; + + /* we want to see the biggest item go first */ + if (obj1_size > obj2_size) + return -1; + if (obj2_size > obj1_size) + return 1; + + return 0; +} /* * Sort the given objects into a type/name-based ordering diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 49d799b953..84fe8b66ad 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -72,6 +72,7 @@ main(int argc, char **argv) RestoreOptions *opts; int c; int exit_code; + int numWorkers = 1; Archive *AH; char *inputFileSpec; static int disable_triggers = 0; @@ -183,7 +184,7 @@ main(int argc, char **argv) break; case 'j': /* number of restore jobs */ - opts->number_of_jobs = atoi(optarg); + numWorkers = atoi(optarg); break; case 'l': /* Dump the TOC summary */ @@ -314,7 +315,7 @@ main(int argc, char **argv) } /* Can't do single-txn mode with multiple connections */ - if (opts->single_txn && opts->number_of_jobs > 1) + if (opts->single_txn && numWorkers > 1) { fprintf(stderr, _("%s: cannot specify both --single-transaction and multiple jobs\n"), progname); @@ -373,6 +374,18 @@ main(int argc, char **argv) if (opts->tocFile) SortTocFromFile(AH, opts); + /* See comments in pg_dump.c */ +#ifdef WIN32 + if (numWorkers > MAXIMUM_WAIT_OBJECTS) + { + fprintf(stderr, _("%s: maximum number of parallel jobs is %d\n"), + progname, MAXIMUM_WAIT_OBJECTS); + exit(1); + } +#endif + + AH->numWorkers = numWorkers; + if (opts->tocSummary) PrintTOCSummary(AH, opts); else @@ -394,6 +407,13 @@ main(int argc, char **argv) return exit_code; } +void +_SetupWorker(Archive *AHX, RestoreOptions *ropt) +{ + ArchiveHandle *AH = (ArchiveHandle *) AHX; + (AH->ReopenPtr) (AH); +} + static void usage(const char *progname) { From 4134cc93054672cd2aac60b787d8fc60224d0b70 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Mon, 15 Oct 2012 15:07:51 -0400 Subject: [PATCH 2/4] Add new files missed in previous commit --- src/bin/pg_dump/parallel.c | 1283 ++++++++++++++++++++++++++++++++++++ src/bin/pg_dump/parallel.h | 86 +++ 2 files changed, 1369 insertions(+) create mode 100644 src/bin/pg_dump/parallel.c create mode 100644 src/bin/pg_dump/parallel.h diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c new file mode 100644 index 0000000000..65bc8bb7cf --- /dev/null 
+++ b/src/bin/pg_dump/parallel.c @@ -0,0 +1,1283 @@ +/*------------------------------------------------------------------------- + * + * parallel.c + * + * Parallel support for the pg_dump archiver + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * The author is not responsible for loss or damages that may + * result from its use. + * + * IDENTIFICATION + * src/bin/pg_dump/parallel.c + * + *------------------------------------------------------------------------- + */ + +#include "pg_backup_db.h" + +#include "dumpmem.h" +#include "dumputils.h" +#include "parallel.h" + +#ifndef WIN32 +#include +#include +#include "signal.h" +#include +#include +#endif + +#define PIPE_READ 0 +#define PIPE_WRITE 1 + +/* file-scope variables */ +#ifdef WIN32 +static unsigned int tMasterThreadId = 0; +static HANDLE termEvent = INVALID_HANDLE_VALUE; +static int pgpipe(int handles[2]); +static int piperead(int s, char *buf, int len); +#define pipewrite(a,b,c) send(a,b,c,0) +#else +/* + * aborting is only ever used in the master, the workers are fine with just + * wantAbort. + */ +static bool aborting = false; +static volatile sig_atomic_t wantAbort = 0; +#define pgpipe(a) pipe(a) +#define piperead(a,b,c) read(a,b,c) +#define pipewrite(a,b,c) write(a,b,c) +#endif + +typedef struct ShutdownInformation +{ + ParallelState *pstate; + Archive *AHX; +} ShutdownInformation; + +static ShutdownInformation shutdown_info; + +static const char *modulename = gettext_noop("parallel archiver"); + +static ParallelSlot *GetMyPSlot(ParallelState *pstate); +static void parallel_exit_msg_func(const char *modulename, + const char *fmt, va_list ap) + __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 0))); +static void parallel_msg_master(ParallelSlot *slot, const char *modulename, + const char *fmt, va_list ap) + __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0))); +static void archive_close_connection(int code, void *arg); +static void ShutdownWorkersHard(ParallelState *pstate); +static void WaitForTerminatingWorkers(ParallelState *pstate); +#ifndef WIN32 +static void sigTermHandler(int signum); +#endif +static void SetupWorker(ArchiveHandle *AH, int pipefd[2], int worker, + RestoreOptions *ropt); +static bool HasEveryWorkerTerminated(ParallelState *pstate); + +static void lockTableNoWait(ArchiveHandle *AH, TocEntry *te); +static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]); +static char *getMessageFromMaster(int pipefd[2]); +static void sendMessageToMaster(int pipefd[2], const char *str); +static int select_loop(int maxFd, fd_set *workerset); +static char *getMessageFromWorker(ParallelState *pstate, + bool do_wait, int *worker); +static void sendMessageToWorker(ParallelState *pstate, + int worker, const char *str); +static char *readMessageFromPipe(int fd); + +#define messageStartsWith(msg, prefix) \ + (strncmp(msg, prefix, strlen(prefix)) == 0) +#define messageEquals(msg, pattern) \ + (strcmp(msg, pattern) == 0) + +static ParallelSlot * +GetMyPSlot(ParallelState *pstate) +{ + int i; + + for (i = 0; i < pstate->numWorkers; i++) +#ifdef WIN32 + if (pstate->parallelSlot[i].threadId == GetCurrentThreadId()) +#else + if (pstate->parallelSlot[i].pid == getpid()) +#endif + return &(pstate->parallelSlot[i]); + + return NULL; +} + +/* + * This is the function that will be called from exit_horribly() to print the + * error message. If the worker process does exit_horribly(), we forward its + * last words to the master process. 
The master process then does + * exit_horribly() with this error message itself and prints it normally. + * After printing the message, exit_horribly() on the master will shut down + * the remaining worker processes. + */ +static void +parallel_exit_msg_func(const char *modulename, const char *fmt, va_list ap) +{ + ParallelState *pstate = shutdown_info.pstate; + ParallelSlot *slot; + + Assert(pstate); + + slot = GetMyPSlot(pstate); + + if (!slot) + /* We're the parent, just write the message out */ + vwrite_msg(modulename, fmt, ap); + else + /* If we're a worker process, send the msg to the master process */ + parallel_msg_master(slot, modulename, fmt, ap); +} + +/* Sends the error message from the worker to the master process */ +static void +parallel_msg_master(ParallelSlot *slot, const char *modulename, + const char *fmt, va_list ap) +{ + char buf[512]; + int pipefd[2]; + + pipefd[PIPE_READ] = slot->pipeRevRead; + pipefd[PIPE_WRITE] = slot->pipeRevWrite; + + strcpy(buf, "ERROR "); + vsnprintf(buf + strlen("ERROR "), + sizeof(buf) - strlen("ERROR "), fmt, ap); + + sendMessageToMaster(pipefd, buf); +} + +/* + * pg_dump and pg_restore register the Archive pointer for the exit handler + * (called from exit_horribly). This function mainly exists so that we can + * keep shutdown_info in file scope only. + */ +void +on_exit_close_archive(Archive *AHX) +{ + shutdown_info.AHX = AHX; + on_exit_nicely(archive_close_connection, &shutdown_info); +} + +/* + * This function can close archives in both the parallel and non-parallel + * case. + */ +static void +archive_close_connection(int code, void *arg) +{ + ShutdownInformation *si = (ShutdownInformation *) arg; + + if (si->pstate) + { + ParallelSlot *slot = GetMyPSlot(si->pstate); + + if (!slot) { + /* + * We're the master: We have already printed out the message + * passed to exit_horribly() either from the master itself or from + * a worker process. Now we need to close our own database + * connection (only open during parallel dump but not restore) and + * shut down the remaining workers. + */ + DisconnectDatabase(si->AHX); +#ifndef WIN32 + /* + * Setting aborting to true switches to best-effort-mode + * (send/receive but ignore errors) in communicating with our + * workers. + */ + aborting = true; +#endif + ShutdownWorkersHard(si->pstate); + } + else if (slot->args->AH) + DisconnectDatabase(&(slot->args->AH->public)); + } + else if (si->AHX) + DisconnectDatabase(si->AHX); +} + +/* + * If we have one worker that terminates for some reason, we'd like the other + * threads to terminate as well (and not finish with their 70 GB table dump + * first...). Now in UNIX we can just kill these processes, and let the signal + * handler set wantAbort to 1. In Windows we set a termEvent and this serves + * as the signal for everyone to terminate. + */ +void +checkAborting(ArchiveHandle *AH) +{ +#ifdef WIN32 + if (WaitForSingleObject(termEvent, 0) == WAIT_OBJECT_0) +#else + if (wantAbort) +#endif + exit_horribly(modulename, "worker is terminating\n"); +} + +/* + * Shut down any remaining workers, this has an implicit do_wait == true. + * + * The fastest way we can make the workers terminate gracefully is when + * they are listening for new commands and we just tell them to terminate. + */ +static void +ShutdownWorkersHard(ParallelState *pstate) +{ +#ifndef WIN32 + int i; + signal(SIGPIPE, SIG_IGN); + + /* + * Close our write end of the sockets so that the workers know they can + * exit. 
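+	 * An idle worker then sees EOF in WaitForCommands() and exits on its
+	 * own; the SIGTERM sent below sets wantAbort in a worker that is still
+	 * busy, so it bails out at its next checkAborting() call.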
+ */ + for (i = 0; i < pstate->numWorkers; i++) + closesocket(pstate->parallelSlot[i].pipeWrite); + + for (i = 0; i < pstate->numWorkers; i++) + kill(pstate->parallelSlot[i].pid, SIGTERM); + +#else + /* The workers monitor this event via checkAborting(). */ + SetEvent(termEvent); +#endif + + WaitForTerminatingWorkers(pstate); +} + +/* + * Wait for the termination of the processes using the OS-specific method. + */ +static void +WaitForTerminatingWorkers(ParallelState *pstate) +{ + while (!HasEveryWorkerTerminated(pstate)) + { + ParallelSlot *slot = NULL; + int j; +#ifndef WIN32 + int status; + pid_t pid = wait(&status); + for (j = 0; j < pstate->numWorkers; j++) + if (pstate->parallelSlot[j].pid == pid) + slot = &(pstate->parallelSlot[j]); +#else + uintptr_t hThread; + DWORD ret; + uintptr_t *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers); + int nrun = 0; + for (j = 0; j < pstate->numWorkers; j++) + if (pstate->parallelSlot[j].workerStatus != WRKR_TERMINATED) + { + lpHandles[nrun] = pstate->parallelSlot[j].hThread; + nrun++; + } + ret = WaitForMultipleObjects(nrun, (HANDLE*) lpHandles, false, INFINITE); + Assert(ret != WAIT_FAILED); + hThread = lpHandles[ret - WAIT_OBJECT_0]; + + for (j = 0; j < pstate->numWorkers; j++) + if (pstate->parallelSlot[j].hThread == hThread) + slot = &(pstate->parallelSlot[j]); + + free(lpHandles); +#endif + Assert(slot); + + slot->workerStatus = WRKR_TERMINATED; + } + Assert(HasEveryWorkerTerminated(pstate)); +} + +#ifndef WIN32 +/* Signal handling (UNIX only) */ +static void +sigTermHandler(int signum) +{ + wantAbort = 1; +} +#endif + +/* + * This function is called by both UNIX and Windows variants to set up a + * worker process. + */ +static void +SetupWorker(ArchiveHandle *AH, int pipefd[2], int worker, + RestoreOptions *ropt) +{ + /* + * In dump mode (pg_dump) this calls _SetupWorker() as defined in + * pg_dump.c, while in restore mode (pg_restore) it calls _SetupWorker() + * as defined in pg_restore.c. + * + * We get the raw connection only for the reason that we can close it + * properly when we shut down. This happens only that way when it is + * brought down because of an error. + */ + _SetupWorker((Archive *) AH, ropt); + + Assert(AH->connection != NULL); + + WaitForCommands(AH, pipefd); + + closesocket(pipefd[PIPE_READ]); + closesocket(pipefd[PIPE_WRITE]); +} + +#ifdef WIN32 +/* + * On Windows the _beginthreadex() function allows us to pass one parameter. + * Since we need to pass a few values however, we define a structure here + * and then pass a pointer to such a structure in _beginthreadex(). + */ +typedef struct { + ArchiveHandle *AH; + RestoreOptions *ropt; + int worker; + int pipeRead; + int pipeWrite; +} WorkerInfo; + +static unsigned __stdcall +init_spawned_worker_win32(WorkerInfo *wi) +{ + ArchiveHandle *AH; + int pipefd[2] = { wi->pipeRead, wi->pipeWrite }; + int worker = wi->worker; + RestoreOptions *ropt = wi->ropt; + + AH = CloneArchive(wi->AH); + + free(wi); + SetupWorker(AH, pipefd, worker, ropt); + + DeCloneArchive(AH); + _endthreadex(0); + return 0; +} +#endif + +/* + * This function starts the parallel dump or restore by spawning off the + * worker processes in both Unix and Windows. For Windows, it creates a number + * of threads while it does a fork() on Unix. 
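+ *
+ * If only one worker is requested, nothing is forked at all: the function
+ * returns a ParallelState whose parallelSlot is NULL and the dump or restore
+ * runs entirely in the calling process.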
+ */ +ParallelState * +ParallelBackupStart(ArchiveHandle *AH, RestoreOptions *ropt) +{ + ParallelState *pstate; + int i; + const size_t slotSize = AH->public.numWorkers * sizeof(ParallelSlot); + + Assert(AH->public.numWorkers > 0); + + /* Ensure stdio state is quiesced before forking */ + fflush(NULL); + + pstate = (ParallelState *) pg_malloc(sizeof(ParallelState)); + + pstate->numWorkers = AH->public.numWorkers; + pstate->parallelSlot = NULL; + + if (AH->public.numWorkers == 1) + return pstate; + + pstate->parallelSlot = (ParallelSlot *) pg_malloc(slotSize); + memset((void *) pstate->parallelSlot, 0, slotSize); + + /* + * Set the pstate in the shutdown_info. The exit handler uses pstate if + * set and falls back to AHX otherwise. + */ + shutdown_info.pstate = pstate; + on_exit_msg_func = parallel_exit_msg_func; + +#ifdef WIN32 + tMasterThreadId = GetCurrentThreadId(); + termEvent = CreateEvent(NULL, true, false, "Terminate"); +#else + signal(SIGTERM, sigTermHandler); + signal(SIGINT, sigTermHandler); + signal(SIGQUIT, sigTermHandler); +#endif + + for (i = 0; i < pstate->numWorkers; i++) + { +#ifdef WIN32 + WorkerInfo *wi; + uintptr_t handle; +#else + pid_t pid; +#endif + int pipeMW[2], pipeWM[2]; + + if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0) + exit_horribly(modulename, + "Cannot create communication channels: %s\n", + strerror(errno)); + + pstate->parallelSlot[i].workerStatus = WRKR_IDLE; + pstate->parallelSlot[i].args = (ParallelArgs *) pg_malloc(sizeof(ParallelArgs)); + pstate->parallelSlot[i].args->AH = NULL; + pstate->parallelSlot[i].args->te = NULL; +#ifdef WIN32 + /* Allocate a new structure for every worker */ + wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo)); + + wi->ropt = ropt; + wi->worker = i; + wi->AH = AH; + wi->pipeRead = pstate->parallelSlot[i].pipeRevRead = pipeMW[PIPE_READ]; + wi->pipeWrite = pstate->parallelSlot[i].pipeRevWrite = pipeWM[PIPE_WRITE]; + + handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32, + wi, 0, &(pstate->parallelSlot[i].threadId)); + pstate->parallelSlot[i].hThread = handle; +#else + pid = fork(); + if (pid == 0) + { + /* we are the worker */ + int j; + int pipefd[2] = { pipeMW[PIPE_READ], pipeWM[PIPE_WRITE] }; + + /* + * Store the fds for the reverse communication in pstate. Actually + * we only use this in case of an error and don't use pstate + * otherwise in the worker process. On Windows we write to the + * global pstate, in Unix we write to our process-local copy but + * that's also where we'd retrieve this information back from. + */ + pstate->parallelSlot[i].pipeRevRead = pipefd[PIPE_READ]; + pstate->parallelSlot[i].pipeRevWrite = pipefd[PIPE_WRITE]; + pstate->parallelSlot[i].pid = getpid(); + + /* + * Call CloneArchive on Unix as well even though technically we + * don't need to because fork() gives us a copy in our own address + * space already. But CloneArchive resets the state information + * and also clones the database connection (for parallel dump) + * which both seem kinda helpful. + */ + pstate->parallelSlot[i].args->AH = CloneArchive(AH); + + /* close read end of Worker -> Master */ + closesocket(pipeWM[PIPE_READ]); + /* close write end of Master -> Worker */ + closesocket(pipeMW[PIPE_WRITE]); + + /* + * Close all inherited fds for communication of the master with + * the other workers. 
+ */ + for (j = 0; j < i; j++) + { + closesocket(pstate->parallelSlot[j].pipeRead); + closesocket(pstate->parallelSlot[j].pipeWrite); + } + + SetupWorker(pstate->parallelSlot[i].args->AH, pipefd, i, ropt); + + exit(0); + } + else if (pid < 0) + /* fork failed */ + exit_horribly(modulename, + "could not create worker process: %s\n", + strerror(errno)); + + /* we are the Master, pid > 0 here */ + Assert(pid > 0); + + /* close read end of Master -> Worker */ + closesocket(pipeMW[PIPE_READ]); + /* close write end of Worker -> Master */ + closesocket(pipeWM[PIPE_WRITE]); + + pstate->parallelSlot[i].pid = pid; +#endif + + pstate->parallelSlot[i].pipeRead = pipeWM[PIPE_READ]; + pstate->parallelSlot[i].pipeWrite = pipeMW[PIPE_WRITE]; + } + + return pstate; +} + +/* + * Tell all of our workers to terminate. + * + * Pretty straightforward routine, first we tell everyone to terminate, then + * we listen to the workers' replies and finally close the sockets that we + * have used for communication. + */ +void +ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate) +{ + int i; + + if (pstate->numWorkers == 1) + return; + + Assert(IsEveryWorkerIdle(pstate)); + + /* close the sockets so that the workers know they can exit */ + for (i = 0; i < pstate->numWorkers; i++) + { + closesocket(pstate->parallelSlot[i].pipeRead); + closesocket(pstate->parallelSlot[i].pipeWrite); + } + WaitForTerminatingWorkers(pstate); + + /* + * Remove the pstate again, so the exit handler in the parent will now + * again fall back to closing AH->connection (if connected). + */ + shutdown_info.pstate = NULL; + + free(pstate->parallelSlot); + free(pstate); +} + + +/* + * The sequence is the following (for dump, similar for restore): + * + * The master process starts the parallel backup in ParllelBackupStart, this + * forks the worker processes which enter WaitForCommand(). + * + * The master process dispatches an individual work item to one of the worker + * processes in DispatchJobForTocEntry(). It calls + * AH->MasterStartParallelItemPtr, a routine of the output format. This + * function's arguments are the parents archive handle AH (containing the full + * catalog information), the TocEntry that the worker should work on and a + * T_Action act indicating whether this is a backup or a restore item. The + * function then converts the TocEntry assignment into a string that is then + * sent over to the worker process. In the simplest case that would be + * something like "DUMP 1234", with 1234 being the TocEntry id. + * + * The worker receives the message in the routine pointed to by + * WorkerJobDumpPtr or WorkerJobRestorePtr. These are also pointers to + * corresponding routines of the respective output format, e.g. + * _WorkerJobDumpDirectory(). + * + * Remember that we have forked off the workers only after we have read in the + * catalog. That's why our worker processes can also access the catalog + * information. Now they re-translate the textual representation to a TocEntry + * on their side and do the required action (restore or dump). + * + * The result is again a textual string that is sent back to the master and is + * interpreted by AH->MasterEndParallelItemPtr. This function can update state + * or catalog information on the master's side, depending on the reply from + * the worker process. In the end it returns status which is 0 for successful + * execution. 
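+ *
+ * For the directory format a complete exchange looks like this: the master
+ * sends "DUMP 1234", the worker writes out the table data and replies
+ * "OK DUMP 1234"; a restore item is dispatched as "RESTORE 1234" and
+ * answered with "OK RESTORE 1234 <status> <n_errors>".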
+ * + * --------------------------------------------------------------------- + * Master Worker + * + * enters WaitForCommands() + * DispatchJobForTocEntry(...te...) + * + * [ Worker is IDLE ] + * + * arg = (MasterStartParallelItemPtr)() + * send: DUMP arg + * receive: DUMP arg + * str = (WorkerJobDumpPtr)(arg) + * [ Worker is WORKING ] ... gets te from arg ... + * ... dump te ... + * send: OK DUMP info + * + * In ListenToWorkers(): + * + * [ Worker is FINISHED ] + * receive: OK DUMP info + * status = (MasterEndParallelItemPtr)(info) + * + * In ReapWorkerStatus(&ptr): + * *ptr = status; + * [ Worker is IDLE ] + * --------------------------------------------------------------------- + */ +void +DispatchJobForTocEntry(ArchiveHandle *AH, ParallelState *pstate, TocEntry *te, + T_Action act) +{ + int worker; + char *arg; + + /* our caller makes sure that at least one worker is idle */ + Assert(GetIdleWorker(pstate) != NO_SLOT); + worker = GetIdleWorker(pstate); + Assert(worker != NO_SLOT); + + arg = (AH->MasterStartParallelItemPtr)(AH, te, act); + + sendMessageToWorker(pstate, worker, arg); + + pstate->parallelSlot[worker].workerStatus = WRKR_WORKING; + pstate->parallelSlot[worker].args->te = te; +} + +/* + * Find the first free parallel slot (if any). + */ +int +GetIdleWorker(ParallelState *pstate) +{ + int i; + for (i = 0; i < pstate->numWorkers; i++) + if (pstate->parallelSlot[i].workerStatus == WRKR_IDLE) + return i; + return NO_SLOT; +} + +/* + * Return true iff every worker process is in the WRKR_TERMINATED state. + */ +static bool +HasEveryWorkerTerminated(ParallelState *pstate) +{ + int i; + for (i = 0; i < pstate->numWorkers; i++) + if (pstate->parallelSlot[i].workerStatus != WRKR_TERMINATED) + return false; + return true; +} + +/* + * Return true iff every worker is in the WRKR_IDLE state. + */ +bool +IsEveryWorkerIdle(ParallelState *pstate) +{ + int i; + for (i = 0; i < pstate->numWorkers; i++) + if (pstate->parallelSlot[i].workerStatus != WRKR_IDLE) + return false; + return true; +} + +/* + * --------------------------------------------------------------------- + * One danger of the parallel backup is a possible deadlock: + * + * 1) Master dumps the schema and locks all tables in ACCESS SHARE mode. + * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted + * because the master holds a conflicting ACCESS SHARE lock). + * 3) The worker process also requests an ACCESS SHARE lock to read the table. + * The worker's not granted that lock but is enqueued behind the ACCESS + * EXCLUSIVE lock request. + * --------------------------------------------------------------------- + * + * Now what we do here is to just request a lock in ACCESS SHARE but with + * NOWAIT in the worker prior to touching the table. If we don't get the lock, + * then we know that somebody else has requested an ACCESS EXCLUSIVE lock and + * are good to just fail the whole backup because we have detected a deadlock. 
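+ *
+ * The statement issued below has the form
+ *     LOCK TABLE <schema>.<table> IN ACCESS SHARE MODE NOWAIT
+ * with the qualified name looked up in pg_class/pg_namespace via the TOC
+ * entry's catalog OID.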
+ */ +static void +lockTableNoWait(ArchiveHandle *AH, TocEntry *te) +{ + Archive *AHX = (Archive *) AH; + const char *qualId; + PQExpBuffer query = createPQExpBuffer(); + PGresult *res; + + Assert(AH->format == archDirectory); + Assert(strcmp(te->desc, "BLOBS") != 0); + + appendPQExpBuffer(query, + "SELECT pg_namespace.nspname," + " pg_class.relname " + " FROM pg_class " + " JOIN pg_namespace on pg_namespace.oid = relnamespace " + " WHERE pg_class.oid = %d", te->catalogId.oid); + + res = PQexec(AH->connection, query->data); + + if (!res || PQresultStatus(res) != PGRES_TUPLES_OK) + exit_horribly(modulename, + "could not get relation name for oid %d: %s\n", + te->catalogId.oid, PQerrorMessage(AH->connection)); + + resetPQExpBuffer(query); + + qualId = fmtQualifiedId(AHX->remoteVersion, + PQgetvalue(res, 0, 0), + PQgetvalue(res, 0, 1)); + + appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT", + qualId); + PQclear(res); + + res = PQexec(AH->connection, query->data); + + if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) + exit_horribly(modulename, + "could not obtain lock on relation \"%s\". This " + "usually means that someone requested an ACCESS EXCLUSIVE lock " + "on the table after the pg_dump parent process has gotten the " + "initial ACCESS SHARE lock on the table.\n", qualId); + + PQclear(res); + destroyPQExpBuffer(query); +} + +/* + * That's the main routine for the worker. + * When it starts up it enters this routine and waits for commands from the + * master process. After having processed a command it comes back to here to + * wait for the next command. Finally it will receive a TERMINATE command and + * exit. + */ +static void +WaitForCommands(ArchiveHandle *AH, int pipefd[2]) +{ + char *command; + DumpId dumpId; + int nBytes; + char *str = NULL; + TocEntry *te; + + for(;;) + { + if (!(command = getMessageFromMaster(pipefd))) + { + PQfinish(AH->connection); + AH->connection = NULL; + return; + } + + if (messageStartsWith(command, "DUMP ")) + { + Assert(AH->format == archDirectory); + sscanf(command + strlen("DUMP "), "%d%n", &dumpId, &nBytes); + Assert(nBytes == strlen(command) - strlen("DUMP ")); + + te = getTocEntryByDumpId(AH, dumpId); + Assert(te != NULL); + + /* + * Lock the table but with NOWAIT. Note that the parent is already + * holding a lock. If we cannot acquire another ACCESS SHARE MODE + * lock, then somebody else has requested an exclusive lock in the + * meantime. lockTableNoWait dies in this case to prevent a + * deadlock. + */ + if (strcmp(te->desc, "BLOBS") != 0) + lockTableNoWait(AH, te); + + /* + * The message we return here has been pg_malloc()ed and we are + * responsible for free()ing it. + */ + str = (AH->WorkerJobDumpPtr)(AH, te); + Assert(AH->connection != NULL); + sendMessageToMaster(pipefd, str); + free(str); + } + else if (messageStartsWith(command, "RESTORE ")) + { + Assert(AH->format == archDirectory || AH->format == archCustom); + Assert(AH->connection != NULL); + + sscanf(command + strlen("RESTORE "), "%d%n", &dumpId, &nBytes); + Assert(nBytes == strlen(command) - strlen("RESTORE ")); + + te = getTocEntryByDumpId(AH, dumpId); + Assert(te != NULL); + /* + * The message we return here has been pg_malloc()ed and we are + * responsible for free()ing it. 
+ */ + str = (AH->WorkerJobRestorePtr)(AH, te); + Assert(AH->connection != NULL); + sendMessageToMaster(pipefd, str); + free(str); + } + else + exit_horribly(modulename, + "Unknown command on communication channel: %s\n", + command); + } +} + +/* + * --------------------------------------------------------------------- + * Note the status change: + * + * DispatchJobForTocEntry WRKR_IDLE -> WRKR_WORKING + * ListenToWorkers WRKR_WORKING -> WRKR_FINISHED / WRKR_TERMINATED + * ReapWorkerStatus WRKR_FINISHED -> WRKR_IDLE + * --------------------------------------------------------------------- + * + * Just calling ReapWorkerStatus() when all workers are working might or might + * not give you an idle worker because you need to call ListenToWorkers() in + * between and only thereafter ReapWorkerStatus(). This is necessary in order + * to get and deal with the status (=result) of the worker's execution. + */ +void +ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait) +{ + int worker; + char *msg; + + msg = getMessageFromWorker(pstate, do_wait, &worker); + + if (!msg) + { + if (do_wait) + exit_horribly(modulename, "A worker process died unexpectedly\n"); + return; + } + + if (messageStartsWith(msg, "OK ")) + { + char *statusString; + TocEntry *te; + + pstate->parallelSlot[worker].workerStatus = WRKR_FINISHED; + te = pstate->parallelSlot[worker].args->te; + if (messageStartsWith(msg, "OK RESTORE ")) + { + statusString = msg + strlen("OK RESTORE "); + pstate->parallelSlot[worker].status = + (AH->MasterEndParallelItemPtr) + (AH, te, statusString, ACT_RESTORE); + } + else if (messageStartsWith(msg, "OK DUMP ")) + { + statusString = msg + strlen("OK DUMP "); + pstate->parallelSlot[worker].status = + (AH->MasterEndParallelItemPtr) + (AH, te, statusString, ACT_DUMP); + } + else + exit_horribly(modulename, + "Invalid message received from worker: %s\n", msg); + } + else if (messageStartsWith(msg, "ERROR ")) + { + Assert(AH->format == archDirectory || AH->format == archCustom); + pstate->parallelSlot[worker].workerStatus = WRKR_TERMINATED; + exit_horribly(modulename, "%s", msg + strlen("ERROR ")); + } + else + exit_horribly(modulename, "Invalid message received from worker: %s\n", msg); + + /* both Unix and Win32 return pg_malloc()ed space, so we free it */ + free(msg); +} + +/* + * This function is executed in the master process. + * + * This function is used to get the return value of a terminated worker + * process. If a process has terminated, its status is stored in *status and + * the id of the worker is returned. + */ +int +ReapWorkerStatus(ParallelState *pstate, int *status) +{ + int i; + + for (i = 0; i < pstate->numWorkers; i++) + { + if (pstate->parallelSlot[i].workerStatus == WRKR_FINISHED) + { + *status = pstate->parallelSlot[i].status; + pstate->parallelSlot[i].status = 0; + pstate->parallelSlot[i].workerStatus = WRKR_IDLE; + return i; + } + } + return NO_SLOT; +} + +/* + * This function is executed in the master process. + * + * It looks for an idle worker process and only returns if there is one. + */ +void +EnsureIdleWorker(ArchiveHandle *AH, ParallelState *pstate) +{ + int ret_worker; + int work_status; + + for (;;) + { + int nTerm = 0; + while ((ret_worker = ReapWorkerStatus(pstate, &work_status)) != NO_SLOT) + { + if (work_status != 0) + exit_horribly(modulename, "Error processing a parallel work item.\n"); + + nTerm++; + } + + /* + * We need to make sure that we have an idle worker before dispatching + * the next item. If nTerm > 0 we already have that (quick check). 
+ */ + if (nTerm > 0) + return; + + /* explicit check for an idle worker */ + if (GetIdleWorker(pstate) != NO_SLOT) + return; + + /* + * If we have no idle worker, read the result of one or more + * workers and loop the loop to call ReapWorkerStatus() on them + */ + ListenToWorkers(AH, pstate, true); + } +} + +/* + * This function is executed in the master process. + * + * It waits for all workers to terminate. + */ +void +EnsureWorkersFinished(ArchiveHandle *AH, ParallelState *pstate) +{ + int work_status; + + if (!pstate || pstate->numWorkers == 1) + return; + + /* Waiting for the remaining worker processes to finish */ + while (!IsEveryWorkerIdle(pstate)) + { + if (ReapWorkerStatus(pstate, &work_status) == NO_SLOT) + ListenToWorkers(AH, pstate, true); + else if (work_status != 0) + exit_horribly(modulename, + "Error processing a parallel work item\n"); + } +} + +/* + * This function is executed in the worker process. + * + * It returns the next message on the communication channel, blocking until it + * becomes available. + */ +static char * +getMessageFromMaster(int pipefd[2]) +{ + return readMessageFromPipe(pipefd[PIPE_READ]); +} + +/* + * This function is executed in the worker process. + * + * It sends a message to the master on the communication channel. + */ +static void +sendMessageToMaster(int pipefd[2], const char *str) +{ + int len = strlen(str) + 1; + + if (pipewrite(pipefd[PIPE_WRITE], str, len) != len) + exit_horribly(modulename, + "Error writing to the communication channel: %s\n", + strerror(errno)); +} + +/* + * A select loop that repeats calling select until a descriptor in the read + * set becomes readable. On Windows we have to check for the termination event + * from time to time, on Unix we can just block forever. + */ +#ifdef WIN32 +static int +select_loop(int maxFd, fd_set *workerset) +{ + int i; + fd_set saveSet = *workerset; + + /* should always be the master */ + Assert(tMasterThreadId == GetCurrentThreadId()); + + for (;;) + { + /* + * sleep a quarter of a second before checking if we should + * terminate. + */ + struct timeval tv = { 0, 250000 }; + *workerset = saveSet; + i = select(maxFd + 1, workerset, NULL, NULL, &tv); + + if (i == SOCKET_ERROR && WSAGetLastError() == WSAEINTR) + continue; + if (i) + break; + } + + return i; +} +#else /* UNIX */ +static int +select_loop(int maxFd, fd_set *workerset) +{ + int i; + + fd_set saveSet = *workerset; + for (;;) + { + *workerset = saveSet; + i = select(maxFd + 1, workerset, NULL, NULL, NULL); + + /* + * If we Ctrl-C the master process , it's likely that we interrupt + * select() here. The signal handler will set wantAbort == true and + * the shutdown journey starts from here. Note that we'll come back + * here later when we tell all workers to terminate and read their + * responses. But then we have aborting set to true. + */ + if (wantAbort && !aborting) + exit_horribly(modulename, "terminated by user\n"); + + if (i < 0 && errno == EINTR) + continue; + break; + } + + return i; +} +#endif + +/* + * This function is executed in the master process. + * + * It returns the next message from the worker on the communication channel, + * optionally blocking (do_wait) until it becomes available. + * + * The id of the worker is returned in *worker. 
+ */ +static char * +getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker) +{ + int i; + fd_set workerset; + int maxFd = -1; + struct timeval nowait = { 0, 0 }; + + FD_ZERO(&workerset); + + for (i = 0; i < pstate->numWorkers; i++) + { + if (pstate->parallelSlot[i].workerStatus == WRKR_TERMINATED) + continue; + FD_SET(pstate->parallelSlot[i].pipeRead, &workerset); + /* actually WIN32 ignores the first parameter to select()... */ + if (pstate->parallelSlot[i].pipeRead > maxFd) + maxFd = pstate->parallelSlot[i].pipeRead; + } + + if (do_wait) + { + i = select_loop(maxFd, &workerset); + Assert(i != 0); + } + else + { + if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0) + return NULL; + } + + if (i < 0) + exit_horribly(modulename, "Error in ListenToWorkers(): %s", strerror(errno)); + + for (i = 0; i < pstate->numWorkers; i++) + { + char *msg; + + if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset)) + continue; + + msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead); + *worker = i; + return msg; + } + Assert(false); + return NULL; +} + +/* + * This function is executed in the master process. + * + * It sends a message to a certain worker on the communication channel. + */ +static void +sendMessageToWorker(ParallelState *pstate, int worker, const char *str) +{ + int len = strlen(str) + 1; + + if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len) + { + /* + * If we're already aborting anyway, don't care if we succeed or not. + * The child might have gone already. + */ +#ifndef WIN32 + if (!aborting) +#endif + exit_horribly(modulename, + "Error writing to the communication channel: %s\n", + strerror(errno)); + } +} + +/* + * The underlying function to read a message from the communication channel + * (fd) with optional blocking (do_wait). + */ +static char * +readMessageFromPipe(int fd) +{ + char *msg; + int msgsize, bufsize; + int ret; + + /* + * The problem here is that we need to deal with several possibilites: + * we could receive only a partial message or several messages at once. + * The caller expects us to return exactly one message however. + * + * We could either read in as much as we can and keep track of what we + * delivered back to the caller or we just read byte by byte. Once we see + * (char) 0, we know that it's the message's end. This would be quite + * inefficient for more data but since we are reading only on the command + * channel, the performance loss does not seem worth the trouble of + * keeping internal states for different file descriptors. + */ + bufsize = 64; /* could be any number */ + msg = (char *) pg_malloc(bufsize); + + msgsize = 0; + for (;;) + { + Assert(msgsize <= bufsize); + ret = piperead(fd, msg + msgsize, 1); + + /* worker has closed the connection or another error happened */ + if (ret <= 0) + return NULL; + + Assert(ret == 1); + + if (msg[msgsize] == '\0') + return msg; + + msgsize++; + if (msgsize == bufsize) + { + /* could be any number */ + bufsize += 16; + msg = (char *) realloc(msg, bufsize); + } + } +} + +#ifdef WIN32 +/* + * This is a replacement version of pipe for Win32 which allows returned + * handles to be used in select(). Note that read/write calls must be replaced + * with recv/send. 
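+ *
+ * The implementation below builds the "pipe" from a pair of connected TCP
+ * sockets on the loopback interface (bound to an ephemeral port), which is
+ * what makes the handles usable with select(). Mirroring pipe(), handles[0]
+ * is treated as the read end and handles[1] as the write end; the function
+ * returns 0 on success and -1 on failure.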
+ */ +static int +pgpipe(int handles[2]) +{ + SOCKET s; + struct sockaddr_in serv_addr; + int len = sizeof(serv_addr); + + handles[0] = handles[1] = INVALID_SOCKET; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) + { + write_msg(modulename, "pgpipe could not create socket: %ui", + WSAGetLastError()); + return -1; + } + + memset((void *) &serv_addr, 0, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(0); + serv_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + if (bind(s, (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR) + { + write_msg(modulename, "pgpipe could not bind: %ui", + WSAGetLastError()); + closesocket(s); + return -1; + } + if (listen(s, 1) == SOCKET_ERROR) + { + write_msg(modulename, "pgpipe could not listen: %ui", + WSAGetLastError()); + closesocket(s); + return -1; + } + if (getsockname(s, (SOCKADDR *) &serv_addr, &len) == SOCKET_ERROR) + { + write_msg(modulename, "pgpipe could not getsockname: %ui", + WSAGetLastError()); + closesocket(s); + return -1; + } + if ((handles[1] = socket(PF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) + { + write_msg(modulename, "pgpipe could not create socket 2: %ui", + WSAGetLastError()); + closesocket(s); + return -1; + } + + if (connect(handles[1], (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR) + { + write_msg(modulename, "pgpipe could not connect socket: %ui", + WSAGetLastError()); + closesocket(s); + return -1; + } + if ((handles[0] = accept(s, (SOCKADDR *) &serv_addr, &len)) == INVALID_SOCKET) + { + write_msg(modulename, "pgpipe could not accept socket: %ui", + WSAGetLastError()); + closesocket(handles[1]); + handles[1] = INVALID_SOCKET; + closesocket(s); + return -1; + } + closesocket(s); + return 0; +} + +static int +piperead(int s, char *buf, int len) +{ + int ret = recv(s, buf, len, 0); + + if (ret < 0 && WSAGetLastError() == WSAECONNRESET) + /* EOF on the pipe! (win32 socket based implementation) */ + ret = 0; + return ret; +} +#endif diff --git a/src/bin/pg_dump/parallel.h b/src/bin/pg_dump/parallel.h new file mode 100644 index 0000000000..3eafe2f5b1 --- /dev/null +++ b/src/bin/pg_dump/parallel.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * parallel.h + * + * Parallel support header file for the pg_dump archiver + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * The author is not responsible for loss or damages that may + * result from its use. 
+ * + * IDENTIFICATION + * src/bin/pg_dump/parallel.h + * + *------------------------------------------------------------------------- + */ + +#include "pg_backup_db.h" + +struct _archiveHandle; +struct _tocEntry; + +typedef enum +{ + WRKR_TERMINATED = 0, + WRKR_IDLE, + WRKR_WORKING, + WRKR_FINISHED +} T_WorkerStatus; + +typedef enum T_Action +{ + ACT_DUMP, + ACT_RESTORE, +} T_Action; + +/* Arguments needed for a worker process */ +typedef struct ParallelArgs +{ + struct _archiveHandle *AH; + struct _tocEntry *te; +} ParallelArgs; + +/* State for each parallel activity slot */ +typedef struct ParallelSlot +{ + ParallelArgs *args; + T_WorkerStatus workerStatus; + int status; + int pipeRead; + int pipeWrite; + int pipeRevRead; + int pipeRevWrite; +#ifdef WIN32 + uintptr_t hThread; + unsigned int threadId; +#else + pid_t pid; +#endif +} ParallelSlot; + +#define NO_SLOT (-1) + +typedef struct ParallelState +{ + int numWorkers; + ParallelSlot *parallelSlot; +} ParallelState; + +extern int GetIdleWorker(ParallelState *pstate); +extern bool IsEveryWorkerIdle(ParallelState *pstate); +extern void ListenToWorkers(struct _archiveHandle *AH, ParallelState *pstate, bool do_wait); +extern int ReapWorkerStatus(ParallelState *pstate, int *status); +extern void EnsureIdleWorker(struct _archiveHandle *AH, ParallelState *pstate); +extern void EnsureWorkersFinished(struct _archiveHandle *AH, ParallelState *pstate); + +extern ParallelState *ParallelBackupStart(struct _archiveHandle *AH, + RestoreOptions *ropt); +extern void DispatchJobForTocEntry(struct _archiveHandle *AH, + ParallelState *pstate, + struct _tocEntry *te, T_Action act); +extern void ParallelBackupEnd(struct _archiveHandle *AH, ParallelState *pstate); + +extern void checkAborting(struct _archiveHandle *AH); + From 73873e839056280ccbca2fbd2401ef488efd0b98 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Tue, 16 Oct 2012 15:57:02 -0400 Subject: [PATCH 3/4] Add missing lib needed by MSVC build. 
--- src/tools/msvc/Mkvcbuild.pm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index a98cc45897..23b56f78f0 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -371,6 +371,7 @@ sub mkvcbuild $psql->AddIncludeDir('src\bin\pg_dump'); $psql->AddIncludeDir('src\backend'); $psql->AddFile('src\bin\psql\psqlscan.l'); + $psql->AddLibrary('ws2_32.lib'); my $pgdump = AddSimpleFrontend('pg_dump', 1); $pgdump->AddIncludeDir('src\backend'); @@ -379,6 +380,7 @@ sub mkvcbuild $pgdump->AddFile('src\bin\pg_dump\pg_dump_sort.c'); $pgdump->AddFile('src\bin\pg_dump\keywords.c'); $pgdump->AddFile('src\backend\parser\kwlookup.c'); + $pgdump->AddLibrary('ws2_32.lib'); my $pgdumpall = AddSimpleFrontend('pg_dump', 1); @@ -396,6 +398,7 @@ sub mkvcbuild $pgdumpall->AddFile('src\bin\pg_dump\dumpmem.c'); $pgdumpall->AddFile('src\bin\pg_dump\keywords.c'); $pgdumpall->AddFile('src\backend\parser\kwlookup.c'); + $pgdumpall->AddLibrary('ws2_32.lib'); my $pgrestore = AddSimpleFrontend('pg_dump', 1); $pgrestore->{name} = 'pg_restore'; @@ -403,6 +406,7 @@ sub mkvcbuild $pgrestore->AddFile('src\bin\pg_dump\pg_restore.c'); $pgrestore->AddFile('src\bin\pg_dump\keywords.c'); $pgrestore->AddFile('src\backend\parser\kwlookup.c'); + $pgrestore->AddLibrary('ws2_32.lib'); my $zic = $solution->AddProject('zic', 'exe', 'utils'); $zic->AddFiles('src\timezone', 'zic.c', 'ialloc.c', 'scheck.c', @@ -549,6 +553,7 @@ sub mkvcbuild $proj->AddIncludeDir('src\bin\psql'); $proj->AddReference($libpq, $libpgport); $proj->AddResourceFile('src\bin\scripts', 'PostgreSQL Utility'); + $proj->AddLibrary('ws2_32.lib'); } # Regression DLL and EXE From 7910ca930287636debd204410553406859f96b74 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sat, 8 Dec 2012 16:27:42 -0500 Subject: [PATCH 4/4] Update to master tip. 
--- contrib/auth_delay/auth_delay.c | 2 +- contrib/file_fdw/file_fdw.c | 24 +- contrib/pageinspect/btreefuncs.c | 20 +- contrib/pg_test_fsync/pg_test_fsync.c | 1 + contrib/pg_test_timing/pg_test_timing.c | 1 + contrib/pg_upgrade/check.c | 7 +- contrib/pg_upgrade/dump.c | 98 +- contrib/pg_upgrade/exec.c | 4 +- contrib/pg_upgrade/file.c | 111 +- contrib/pg_upgrade/info.c | 3 +- contrib/pg_upgrade/pg_upgrade.c | 56 +- contrib/pg_upgrade/pg_upgrade.h | 12 +- contrib/pg_upgrade/relfilenode.c | 213 ++-- contrib/pg_upgrade/server.c | 13 +- contrib/pg_upgrade/test.sh | 36 +- contrib/pg_upgrade/util.c | 12 + contrib/pgbench/pgbench.c | 20 +- contrib/sepgsql/expected/ddl.out | 53 + contrib/sepgsql/hooks.c | 125 +- contrib/sepgsql/relation.c | 194 ++- contrib/sepgsql/sepgsql.h | 2 +- contrib/sepgsql/sql/ddl.sql | 12 + contrib/tcn/tcn.c | 4 +- doc/src/sgml/Makefile | 6 +- doc/src/sgml/catalogs.sgml | 12 +- doc/src/sgml/client-auth.sgml | 39 +- doc/src/sgml/config.sgml | 26 +- doc/src/sgml/datatype.sgml | 2 +- doc/src/sgml/docguide.sgml | 4 +- doc/src/sgml/func.sgml | 2 +- doc/src/sgml/indices.sgml | 6 +- doc/src/sgml/libpq.sgml | 24 + doc/src/sgml/pgupgrade.sgml | 5 +- doc/src/sgml/protocol.sgml | 133 +- doc/src/sgml/ref/copy.sgml | 23 + doc/src/sgml/ref/create_function.sgml | 2 +- doc/src/sgml/ref/create_index.sgml | 12 + doc/src/sgml/ref/create_table.sgml | 1 + doc/src/sgml/ref/create_view.sgml | 7 +- doc/src/sgml/ref/drop_index.sgml | 37 +- doc/src/sgml/ref/initdb.sgml | 11 + doc/src/sgml/ref/pg_basebackup.sgml | 8 +- doc/src/sgml/ref/pg_dump.sgml | 21 +- doc/src/sgml/ref/pg_dumpall.sgml | 4 +- doc/src/sgml/ref/pg_receivexlog.sgml | 7 +- doc/src/sgml/ref/pg_restore.sgml | 27 +- doc/src/sgml/release-8.3.sgml | 296 +++++ doc/src/sgml/release-8.4.sgml | 302 +++++ doc/src/sgml/release-9.0.sgml | 378 ++++++ doc/src/sgml/release-9.1.sgml | 465 ++++++- doc/src/sgml/release-9.2.sgml | 731 ++++++++++- doc/src/sgml/sepgsql.sgml | 6 + doc/src/sgml/stylesheet.xsl | 2 +- src/Makefile.global.in | 14 +- src/backend/access/Makefile | 2 +- src/backend/access/gin/ginfast.c | 10 +- src/backend/access/gin/ginxlog.c | 184 ++- src/backend/access/gist/gistxlog.c | 240 ++-- src/backend/access/hash/hash.c | 5 - src/backend/access/heap/README.HOT | 28 + src/backend/access/heap/heapam.c | 475 +++---- src/backend/access/nbtree/nbtxlog.c | 339 ++--- src/backend/access/rmgrdesc/Makefile | 15 + src/backend/access/rmgrdesc/clogdesc.c | 41 + src/backend/access/rmgrdesc/dbasedesc.c | 43 + src/backend/access/rmgrdesc/gindesc.c | 83 ++ src/backend/access/rmgrdesc/gistdesc.c | 68 ++ src/backend/access/rmgrdesc/hashdesc.c | 22 + src/backend/access/rmgrdesc/heapdesc.c | 165 +++ src/backend/access/rmgrdesc/mxactdesc.c | 51 + src/backend/access/rmgrdesc/nbtdesc.c | 162 +++ src/backend/access/rmgrdesc/relmapdesc.c | 33 + src/backend/access/rmgrdesc/seqdesc.c | 36 + src/backend/access/rmgrdesc/smgrdesc.c | 45 + src/backend/access/rmgrdesc/spgdesc.c | 89 ++ src/backend/access/rmgrdesc/standbydesc.c | 65 + src/backend/access/rmgrdesc/tblspcdesc.c | 40 + src/backend/access/rmgrdesc/xactdesc.c | 194 +++ src/backend/access/rmgrdesc/xlogdesc.c | 120 ++ src/backend/access/spgist/spgvacuum.c | 2 +- src/backend/access/spgist/spgxlog.c | 347 +++--- src/backend/access/transam/README | 67 +- src/backend/access/transam/clog.c | 23 - src/backend/access/transam/multixact.c | 33 - src/backend/access/transam/rmgr.c | 2 +- src/backend/access/transam/slru.c | 24 +- src/backend/access/transam/timeline.c | 160 ++- src/backend/access/transam/twophase.c | 69 +- 
src/backend/access/transam/xact.c | 192 +-- src/backend/access/transam/xlog.c | 1087 ++++++++++------- src/backend/access/transam/xlogarchive.c | 19 +- src/backend/bootstrap/bootparse.y | 3 +- src/backend/catalog/dependency.c | 57 +- src/backend/catalog/heap.c | 56 +- src/backend/catalog/index.c | 406 ++++-- src/backend/catalog/pg_constraint.c | 12 +- src/backend/catalog/storage.c | 51 +- src/backend/catalog/toasting.c | 3 +- src/backend/commands/alter.c | 16 +- src/backend/commands/cluster.c | 11 +- src/backend/commands/copy.c | 42 +- src/backend/commands/dbcommands.c | 24 - src/backend/commands/dropcmds.c | 4 +- src/backend/commands/extension.c | 7 +- src/backend/commands/indexcmds.c | 55 +- src/backend/commands/portalcmds.c | 2 +- src/backend/commands/prepare.c | 2 +- src/backend/commands/sequence.c | 18 - src/backend/commands/tablecmds.c | 200 +-- src/backend/commands/tablespace.c | 22 - src/backend/commands/trigger.c | 43 +- src/backend/commands/typecmds.c | 57 +- src/backend/commands/vacuum.c | 30 +- src/backend/commands/vacuumlazy.c | 16 +- src/backend/executor/execMain.c | 43 +- src/backend/executor/execUtils.c | 3 + src/backend/executor/nodeLockRows.c | 27 +- src/backend/executor/nodeMergeAppend.c | 114 +- src/backend/executor/nodeModifyTable.c | 93 +- src/backend/executor/spi.c | 18 +- src/backend/lib/Makefile | 2 +- src/backend/lib/binaryheap.c | 293 +++++ src/backend/lib/dllist.c | 214 ---- src/backend/lib/ilist.c | 114 ++ src/backend/libpq/auth.c | 2 +- src/backend/libpq/be-fsstubs.c | 18 +- src/backend/libpq/hba.c | 62 +- src/backend/libpq/pqcomm.c | 8 + src/backend/nodes/outfuncs.c | 1 + src/backend/optimizer/path/allpaths.c | 8 +- src/backend/optimizer/path/equivclass.c | 100 +- src/backend/optimizer/path/indxpath.c | 146 ++- src/backend/optimizer/plan/initsplan.c | 95 +- src/backend/optimizer/plan/planagg.c | 8 +- src/backend/optimizer/util/plancat.c | 5 +- src/backend/parser/analyze.c | 4 + src/backend/parser/gram.y | 4 + src/backend/parser/parse_node.c | 10 +- src/backend/parser/parse_utilcmd.c | 8 +- src/backend/port/ipc_test.c | 3 +- src/backend/port/unix_latch.c | 274 +++-- src/backend/port/win32_latch.c | 46 +- src/backend/postmaster/autovacuum.c | 219 ++-- src/backend/postmaster/bgwriter.c | 1 + src/backend/postmaster/checkpointer.c | 1 + src/backend/postmaster/pgarch.c | 6 +- src/backend/postmaster/pgstat.c | 2 + src/backend/postmaster/postmaster.c | 100 +- src/backend/postmaster/syslogger.c | 19 +- src/backend/postmaster/walwriter.c | 1 + src/backend/replication/walreceiver.c | 147 ++- src/backend/replication/walsender.c | 146 ++- src/backend/rewrite/rewriteDefine.c | 11 +- src/backend/rewrite/rewriteHandler.c | 65 +- src/backend/rewrite/rewriteManip.c | 91 +- src/backend/storage/buffer/bufmgr.c | 34 +- src/backend/storage/file/copydir.c | 26 +- src/backend/storage/file/fd.c | 216 ++-- src/backend/storage/ipc/procarray.c | 169 +-- src/backend/storage/ipc/standby.c | 71 +- src/backend/storage/lmgr/lock.c | 3 +- src/backend/storage/lmgr/proc.c | 22 +- src/backend/storage/smgr/md.c | 13 +- src/backend/storage/smgr/smgr.c | 97 +- src/backend/tcop/postgres.c | 32 +- src/backend/tcop/pquery.c | 15 +- src/backend/tcop/utility.c | 22 +- src/backend/utils/adt/int.c | 125 +- src/backend/utils/adt/int8.c | 98 +- src/backend/utils/adt/numeric.c | 259 ++-- src/backend/utils/adt/rowtypes.c | 52 +- src/backend/utils/adt/ruleutils.c | 18 +- src/backend/utils/adt/selfuncs.c | 12 +- src/backend/utils/adt/timestamp.c | 44 + src/backend/utils/cache/catcache.c | 141 +-- 
src/backend/utils/cache/relcache.c | 35 +- src/backend/utils/cache/relmapper.c | 35 +- src/backend/utils/error/elog.c | 60 +- src/backend/utils/hash/dynahash.c | 46 +- src/backend/utils/misc/guc.c | 6 +- src/backend/utils/mmgr/portalmem.c | 19 + src/backend/utils/time/snapmgr.c | 9 + src/bin/initdb/initdb.c | 705 ++++++----- src/bin/pg_basebackup/pg_receivexlog.c | 1 + src/bin/pg_basebackup/receivelog.c | 200 ++- src/bin/pg_controldata/pg_controldata.c | 4 +- src/bin/pg_ctl/pg_ctl.c | 95 +- src/bin/pg_dump/pg_backup_archiver.c | 32 +- src/bin/pg_dump/pg_dump.c | 356 +++--- src/bin/pg_dump/pg_dump.h | 1 + src/bin/pg_dump/pg_dump_sort.c | 3 + src/bin/pg_dump/pg_dumpall.c | 63 +- src/bin/pg_resetxlog/pg_resetxlog.c | 11 +- src/bin/psql/psqlscan.l | 2 +- src/include/access/gist_private.h | 8 +- src/include/access/heapam.h | 37 +- src/include/access/timeline.h | 20 +- src/include/access/xlog.h | 16 +- src/include/access/xlog_internal.h | 22 +- src/include/c.h | 8 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/heap.h | 4 +- src/include/catalog/index.h | 14 +- src/include/catalog/objectaccess.h | 13 + src/include/catalog/pg_constraint.h | 3 +- src/include/catalog/pg_control.h | 3 +- src/include/catalog/pg_index.h | 25 +- src/include/catalog/storage.h | 6 - src/include/catalog/storage_xlog.h | 49 + src/include/commands/alter.h | 4 +- src/include/commands/tablecmds.h | 7 +- src/include/commands/typecmds.h | 10 +- src/include/lib/binaryheap.h | 53 + src/include/lib/dllist.h | 85 -- src/include/lib/ilist.h | 735 +++++++++++ src/include/libpq/hba.h | 1 + src/include/libpq/pqcomm.h | 13 + src/include/miscadmin.h | 2 +- src/include/nodes/execnodes.h | 8 +- src/include/nodes/parsenodes.h | 1 + src/include/nodes/relation.h | 1 + src/include/optimizer/paths.h | 3 +- src/include/optimizer/planmain.h | 4 +- src/include/parser/parse_node.h | 2 +- src/include/port/win32.h | 8 +- src/include/replication/walprotocol.h | 128 -- src/include/rewrite/rewriteManip.h | 18 +- src/include/storage/fd.h | 12 +- src/include/storage/latch.h | 14 +- src/include/storage/lock.h | 1 + src/include/storage/proc.h | 3 +- src/include/storage/procarray.h | 4 +- src/include/storage/smgr.h | 9 +- src/include/storage/standby.h | 4 +- src/include/tcop/pquery.h | 2 +- src/include/utils/catcache.h | 18 +- src/include/utils/elog.h | 14 +- src/include/utils/portal.h | 1 + src/include/utils/snapmgr.h | 1 + src/include/utils/timestamp.h | 13 +- src/interfaces/ecpg/Makefile | 2 + src/interfaces/ecpg/ecpglib/Makefile | 4 +- src/interfaces/ecpg/preproc/Makefile | 6 + src/interfaces/ecpg/preproc/type.c | 7 +- src/interfaces/ecpg/preproc/variable.c | 6 +- src/interfaces/libpq/exports.txt | 1 + src/interfaces/libpq/fe-connect.c | 283 +++-- src/interfaces/libpq/libpq-fe.h | 3 + src/makefiles/pgxs.mk | 3 + src/test/isolation/Makefile | 14 +- .../expected/drop-index-concurrently-1.out | 40 + src/test/isolation/isolation_schedule | 1 + .../specs/drop-index-concurrently-1.spec | 38 + src/test/regress/expected/aggregates.out | 39 + src/test/regress/expected/alter_table.out | 22 + src/test/regress/expected/copy2.out | 124 ++ src/test/regress/expected/enum.out | 24 + src/test/regress/expected/int2.out | 11 + src/test/regress/expected/int4.out | 21 + .../expected/int8-exp-three-digits.out | 31 + src/test/regress/expected/int8.out | 31 + src/test/regress/expected/join.out | 71 ++ src/test/regress/expected/rules.out | 22 + src/test/regress/expected/triggers.out | 194 +++ src/test/regress/parallel_schedule | 8 +- src/test/regress/pg_regress.c | 
27 +- src/test/regress/serial_schedule | 4 +- src/test/regress/sql/aggregates.sql | 5 + src/test/regress/sql/alter_table.sql | 7 + src/test/regress/sql/copy2.sql | 80 ++ src/test/regress/sql/enum.sql | 27 + src/test/regress/sql/int2.sql | 5 + src/test/regress/sql/int4.sql | 8 + src/test/regress/sql/int8.sql | 11 + src/test/regress/sql/join.sql | 32 + src/test/regress/sql/rules.sql | 15 + src/test/regress/sql/triggers.sql | 155 +++ src/timezone/data/africa | 28 +- src/timezone/data/asia | 96 +- src/timezone/data/australasia | 23 +- src/timezone/data/europe | 2 +- src/timezone/data/northamerica | 11 +- src/timezone/data/southamerica | 17 +- src/tools/find_static | 3 +- 284 files changed, 12524 insertions(+), 5484 deletions(-) create mode 100644 src/backend/access/rmgrdesc/Makefile create mode 100644 src/backend/access/rmgrdesc/clogdesc.c create mode 100644 src/backend/access/rmgrdesc/dbasedesc.c create mode 100644 src/backend/access/rmgrdesc/gindesc.c create mode 100644 src/backend/access/rmgrdesc/gistdesc.c create mode 100644 src/backend/access/rmgrdesc/hashdesc.c create mode 100644 src/backend/access/rmgrdesc/heapdesc.c create mode 100644 src/backend/access/rmgrdesc/mxactdesc.c create mode 100644 src/backend/access/rmgrdesc/nbtdesc.c create mode 100644 src/backend/access/rmgrdesc/relmapdesc.c create mode 100644 src/backend/access/rmgrdesc/seqdesc.c create mode 100644 src/backend/access/rmgrdesc/smgrdesc.c create mode 100644 src/backend/access/rmgrdesc/spgdesc.c create mode 100644 src/backend/access/rmgrdesc/standbydesc.c create mode 100644 src/backend/access/rmgrdesc/tblspcdesc.c create mode 100644 src/backend/access/rmgrdesc/xactdesc.c create mode 100644 src/backend/access/rmgrdesc/xlogdesc.c create mode 100644 src/backend/lib/binaryheap.c delete mode 100644 src/backend/lib/dllist.c create mode 100644 src/backend/lib/ilist.c create mode 100644 src/include/catalog/storage_xlog.h create mode 100644 src/include/lib/binaryheap.h delete mode 100644 src/include/lib/dllist.h create mode 100644 src/include/lib/ilist.h delete mode 100644 src/include/replication/walprotocol.h create mode 100644 src/test/isolation/expected/drop-index-concurrently-1.out create mode 100644 src/test/isolation/specs/drop-index-concurrently-1.spec diff --git a/contrib/auth_delay/auth_delay.c b/contrib/auth_delay/auth_delay.c index 4e0d5959d1..3131e827b8 100644 --- a/contrib/auth_delay/auth_delay.c +++ b/contrib/auth_delay/auth_delay.c @@ -59,7 +59,7 @@ _PG_init(void) NULL, &auth_delay_milliseconds, 0, - 0, INT_MAX, + 0, INT_MAX / 1000, PGC_SIGHUP, GUC_UNIT_MS, NULL, diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 81fc4e2900..7ab3ed603a 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -614,13 +614,13 @@ fileIterateForeignScan(ForeignScanState *node) FileFdwExecutionState *festate = (FileFdwExecutionState *) node->fdw_state; TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; bool found; - ErrorContextCallback errcontext; + ErrorContextCallback errcallback; /* Set up callback to identify error line number. 
*/ - errcontext.callback = CopyFromErrorCallback; - errcontext.arg = (void *) festate->cstate; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; + errcallback.callback = CopyFromErrorCallback; + errcallback.arg = (void *) festate->cstate; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; /* * The protocol for loading a virtual tuple into a slot is first @@ -642,7 +642,7 @@ fileIterateForeignScan(ForeignScanState *node) ExecStoreVirtualTuple(slot); /* Remove error callback. */ - error_context_stack = errcontext.previous; + error_context_stack = errcallback.previous; return slot; } @@ -976,7 +976,7 @@ file_acquire_sample_rows(Relation onerel, int elevel, char *filename; List *options; CopyState cstate; - ErrorContextCallback errcontext; + ErrorContextCallback errcallback; MemoryContext oldcontext = CurrentMemoryContext; MemoryContext tupcontext; @@ -1009,10 +1009,10 @@ file_acquire_sample_rows(Relation onerel, int elevel, rstate = anl_init_selection_state(targrows); /* Set up callback to identify error line number. */ - errcontext.callback = CopyFromErrorCallback; - errcontext.arg = (void *) cstate; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; + errcallback.callback = CopyFromErrorCallback; + errcallback.arg = (void *) cstate; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; *totalrows = 0; *totaldeadrows = 0; @@ -1072,7 +1072,7 @@ file_acquire_sample_rows(Relation onerel, int elevel, } /* Remove error callback. */ - error_context_stack = errcontext.previous; + error_context_stack = errcallback.previous; /* Clean up. */ MemoryContextDelete(tupcontext); diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index dbb2158ba8..bc34af9b3c 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -156,9 +156,9 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) } /* ----------------------------------------------- - * bt_page() + * bt_page_stats() * - * Usage: SELECT * FROM bt_page('t1_pkey', 1); + * Usage: SELECT * FROM bt_page_stats('t1_pkey', 1); * ----------------------------------------------- */ Datum @@ -204,6 +204,7 @@ bt_page_stats(PG_FUNCTION_ARGS) CHECK_RELATION_BLOCK_RANGE(rel, blkno); buffer = ReadBuffer(rel, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); /* keep compiler quiet */ stat.btpo_prev = stat.btpo_next = InvalidBlockNumber; @@ -211,6 +212,9 @@ bt_page_stats(PG_FUNCTION_ARGS) GetBTPageStatistics(blkno, buffer, &stat); + UnlockReleaseBuffer(buffer); + relation_close(rel, AccessShareLock); + /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); @@ -247,10 +251,6 @@ bt_page_stats(PG_FUNCTION_ARGS) result = HeapTupleGetDatum(tuple); - ReleaseBuffer(buffer); - - relation_close(rel, AccessShareLock); - PG_RETURN_DATUM(result); } @@ -322,6 +322,7 @@ bt_page_items(PG_FUNCTION_ARGS) CHECK_RELATION_BLOCK_RANGE(rel, blkno); buffer = ReadBuffer(rel, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); /* * We copy the page into local storage to avoid holding pin on the @@ -335,7 +336,7 @@ bt_page_items(PG_FUNCTION_ARGS) uargs->page = palloc(BLCKSZ); memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ); - ReleaseBuffer(buffer); + UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); uargs->offset = FirstOffsetNumber; @@ -466,6 +467,8 @@ 
bt_metap(PG_FUNCTION_ARGS) errmsg("cannot access temporary tables of other sessions"))); buffer = ReadBuffer(rel, 0); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); metad = BTPageGetMeta(page); @@ -492,8 +495,7 @@ bt_metap(PG_FUNCTION_ARGS) result = HeapTupleGetDatum(tuple); - ReleaseBuffer(buffer); - + UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); PG_RETURN_DATUM(result); diff --git a/contrib/pg_test_fsync/pg_test_fsync.c b/contrib/pg_test_fsync/pg_test_fsync.c index efa1b3284e..ec4b90c797 100644 --- a/contrib/pg_test_fsync/pg_test_fsync.c +++ b/contrib/pg_test_fsync/pg_test_fsync.c @@ -140,6 +140,7 @@ handle_args(int argc, char *argv[]) {"secs-per-test", required_argument, NULL, 's'}, {NULL, 0, NULL, 0} }; + int option; /* Command line option */ int optindex = 0; /* used by getopt_long */ diff --git a/contrib/pg_test_timing/pg_test_timing.c b/contrib/pg_test_timing/pg_test_timing.c index 8d79c7bd74..191c621376 100644 --- a/contrib/pg_test_timing/pg_test_timing.c +++ b/contrib/pg_test_timing/pg_test_timing.c @@ -43,6 +43,7 @@ handle_args(int argc, char *argv[]) {"duration", required_argument, NULL, 'd'}, {NULL, 0, NULL, 0} }; + int option; /* Command line option */ int optindex = 0; /* used by getopt_long */ diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c index e9e9a4fe33..bccceb1e35 100644 --- a/contrib/pg_upgrade/check.c +++ b/contrib/pg_upgrade/check.c @@ -72,7 +72,7 @@ output_check_banner(bool *live_check) void -check_old_cluster(bool live_check, char **sequence_script_file_name) +check_and_dump_old_cluster(bool live_check, char **sequence_script_file_name) { /* -- OLD -- */ @@ -131,10 +131,7 @@ check_old_cluster(bool live_check, char **sequence_script_file_name) * the old server is running. */ if (!user_opts.check) - { generate_old_dump(); - split_old_dump(); - } if (!live_check) stop_postmaster(false); @@ -987,7 +984,7 @@ get_canonical_locale_name(int category, const char *locale) if (!setlocale(category, save)) pg_log(PG_FATAL, "failed to restore old locale \"%s\"\n", save); - free(save); + pg_free(save); return res; } diff --git a/contrib/pg_upgrade/dump.c b/contrib/pg_upgrade/dump.c index 577ccac01f..2c1b65b255 100644 --- a/contrib/pg_upgrade/dump.c +++ b/contrib/pg_upgrade/dump.c @@ -16,95 +16,35 @@ void generate_old_dump(void) { - /* run new pg_dumpall binary */ - prep_status("Creating catalog dump"); + int dbnum; - /* - * --binary-upgrade records the width of dropped columns in pg_class, and - * restores the frozenid's for databases and relations. - */ + prep_status("Creating dump of global objects"); + + /* run new pg_dumpall binary for globals */ exec_prog(UTILITY_LOG_FILE, NULL, true, - "\"%s/pg_dumpall\" %s --schema-only --binary-upgrade %s -f %s", + "\"%s/pg_dumpall\" %s --schema-only --globals-only --binary-upgrade %s -f %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), log_opts.verbose ? "--verbose" : "", - ALL_DUMP_FILE); + GLOBALS_DUMP_FILE); check_ok(); -} + prep_status("Creating dump of database schemas\n"); -/* - * split_old_dump - * - * This function splits pg_dumpall output into global values and - * database creation, and per-db schemas. This allows us to create - * the support functions between restoring these two parts of the - * dump. We split on the first "\connect " after a CREATE ROLE - * username match; this is where the per-db restore starts. 
- * - * We suppress recreation of our own username so we don't generate - * an error during restore - */ -void -split_old_dump(void) -{ - FILE *all_dump, - *globals_dump, - *db_dump; - FILE *current_output; - char line[LINE_ALLOC]; - bool start_of_line = true; - char create_role_str[MAX_STRING]; - char create_role_str_quote[MAX_STRING]; - char filename[MAXPGPATH]; - bool suppressed_username = false; - - - /* - * Open all files in binary mode to avoid line end translation on Windows, - * both for input and output. - */ - - snprintf(filename, sizeof(filename), "%s", ALL_DUMP_FILE); - if ((all_dump = fopen(filename, PG_BINARY_R)) == NULL) - pg_log(PG_FATAL, "Could not open dump file \"%s\": %s\n", filename, getErrorText(errno)); - snprintf(filename, sizeof(filename), "%s", GLOBALS_DUMP_FILE); - if ((globals_dump = fopen_priv(filename, PG_BINARY_W)) == NULL) - pg_log(PG_FATAL, "Could not write to dump file \"%s\": %s\n", filename, getErrorText(errno)); - snprintf(filename, sizeof(filename), "%s", DB_DUMP_FILE); - if ((db_dump = fopen_priv(filename, PG_BINARY_W)) == NULL) - pg_log(PG_FATAL, "Could not write to dump file \"%s\": %s\n", filename, getErrorText(errno)); - - current_output = globals_dump; - - /* patterns used to prevent our own username from being recreated */ - snprintf(create_role_str, sizeof(create_role_str), - "CREATE ROLE %s;", os_info.user); - snprintf(create_role_str_quote, sizeof(create_role_str_quote), - "CREATE ROLE %s;", quote_identifier(os_info.user)); - - while (fgets(line, sizeof(line), all_dump) != NULL) + /* create per-db dump files */ + for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++) { - /* switch to db_dump file output? */ - if (current_output == globals_dump && start_of_line && - suppressed_username && - strncmp(line, "\\connect ", strlen("\\connect ")) == 0) - current_output = db_dump; + char file_name[MAXPGPATH]; + DbInfo *old_db = &old_cluster.dbarr.dbs[dbnum]; - /* output unless we are recreating our own username */ - if (current_output != globals_dump || !start_of_line || - (strncmp(line, create_role_str, strlen(create_role_str)) != 0 && - strncmp(line, create_role_str_quote, strlen(create_role_str_quote)) != 0)) - fputs(line, current_output); - else - suppressed_username = true; + pg_log(PG_REPORT, OVERWRITE_MESSAGE, old_db->db_name); + snprintf(file_name, sizeof(file_name), DB_DUMP_FILE_MASK, old_db->db_oid); - if (strlen(line) > 0 && line[strlen(line) - 1] == '\n') - start_of_line = true; - else - start_of_line = false; + exec_prog(RESTORE_LOG_FILE, NULL, true, + "\"%s/pg_dump\" %s --schema-only --binary-upgrade --format=custom %s --file=\"%s\" \"%s\"", + new_cluster.bindir, cluster_conn_opts(&old_cluster), + log_opts.verbose ? "--verbose" : "", file_name, old_db->db_name); } - fclose(all_dump); - fclose(globals_dump); - fclose(db_dump); + end_progress_output(); + check_ok(); } diff --git a/contrib/pg_upgrade/exec.c b/contrib/pg_upgrade/exec.c index 76247fdbc8..35de5413f4 100644 --- a/contrib/pg_upgrade/exec.c +++ b/contrib/pg_upgrade/exec.c @@ -104,8 +104,10 @@ exec_prog(const char *log_file, const char *opt_log_file, if (result != 0) { - report_status(PG_REPORT, "*failure*"); + /* we might be in on a progress status line, so go to the next line */ + report_status(PG_REPORT, "\n*failure*"); fflush(stdout); + pg_log(PG_VERBOSE, "There were problems executing \"%s\"\n", cmd); if (opt_log_file) pg_log(throw_error ? 
PG_FATAL : PG_REPORT, diff --git a/contrib/pg_upgrade/file.c b/contrib/pg_upgrade/file.c index a5d92c62fc..b5d78a57de 100644 --- a/contrib/pg_upgrade/file.c +++ b/contrib/pg_upgrade/file.c @@ -133,6 +133,8 @@ copy_file(const char *srcfile, const char *dstfile, bool force) int src_fd; int dest_fd; char *buffer; + int ret = 0; + int save_errno = 0; if ((srcfile == NULL) || (dstfile == NULL)) return -1; @@ -148,18 +150,7 @@ copy_file(const char *srcfile, const char *dstfile, bool force) return -1; } - buffer = (char *) malloc(COPY_BUF_SIZE); - - if (buffer == NULL) - { - if (src_fd != 0) - close(src_fd); - - if (dest_fd != 0) - close(dest_fd); - - return -1; - } + buffer = (char *) pg_malloc(COPY_BUF_SIZE); /* perform data copying i.e read src source, write to destination */ while (true) @@ -168,19 +159,9 @@ copy_file(const char *srcfile, const char *dstfile, bool force) if (nbytes < 0) { - int save_errno = errno; - - if (buffer != NULL) - free(buffer); - - if (src_fd != 0) - close(src_fd); - - if (dest_fd != 0) - close(dest_fd); - - errno = save_errno; - return -1; + save_errno = errno; + ret = -1; + break; } if (nbytes == 0) @@ -190,25 +171,13 @@ copy_file(const char *srcfile, const char *dstfile, bool force) if (write(dest_fd, buffer, nbytes) != nbytes) { - /* if write didn't set errno, assume problem is no disk space */ - int save_errno = errno ? errno : ENOSPC; - - if (buffer != NULL) - free(buffer); - - if (src_fd != 0) - close(src_fd); - - if (dest_fd != 0) - close(dest_fd); - - errno = save_errno; - return -1; + save_errno = errno; + ret = -1; + break; } } - if (buffer != NULL) - free(buffer); + pg_free(buffer); if (src_fd != 0) close(src_fd); @@ -216,64 +185,12 @@ copy_file(const char *srcfile, const char *dstfile, bool force) if (dest_fd != 0) close(dest_fd); - return 1; -} -#endif - + if (save_errno != 0) + errno = save_errno; -/* - * load_directory() - * - * Read all the file names in the specified directory, and return them as - * an array of "char *" pointers. The array address is returned in - * *namelist, and the function result is the count of file names. - * - * To free the result data, free each (char *) array member, then free the - * namelist array itself. 
- */ -int -load_directory(const char *dirname, char ***namelist) -{ - DIR *dirdesc; - struct dirent *direntry; - int count = 0; - int allocsize = 64; /* initial array size */ - - *namelist = (char **) pg_malloc(allocsize * sizeof(char *)); - - if ((dirdesc = opendir(dirname)) == NULL) - pg_log(PG_FATAL, "could not open directory \"%s\": %s\n", - dirname, getErrorText(errno)); - - while (errno = 0, (direntry = readdir(dirdesc)) != NULL) - { - if (count >= allocsize) - { - allocsize *= 2; - *namelist = (char **) - pg_realloc(*namelist, allocsize * sizeof(char *)); - } - - (*namelist)[count++] = pg_strdup(direntry->d_name); - } - -#ifdef WIN32 - /* - * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in - * released version - */ - if (GetLastError() == ERROR_NO_MORE_FILES) - errno = 0; -#endif - - if (errno) - pg_log(PG_FATAL, "could not read directory \"%s\": %s\n", - dirname, getErrorText(errno)); - - closedir(dirdesc); - - return count; + return ret; } +#endif void diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c index c406941c98..2250442706 100644 --- a/contrib/pg_upgrade/info.c +++ b/contrib/pg_upgrade/info.c @@ -18,6 +18,7 @@ static void create_rel_filename_map(const char *old_data, const char *new_data, const DbInfo *old_db, const DbInfo *new_db, const RelInfo *old_rel, const RelInfo *new_rel, FileNameMap *map); +static void free_db_and_rel_infos(DbInfoArr *db_arr); static void get_db_infos(ClusterInfo *cluster); static void get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo); static void free_rel_infos(RelInfoArr *rel_arr); @@ -370,7 +371,7 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) } -void +static void free_db_and_rel_infos(DbInfoArr *db_arr) { int dbnum; diff --git a/contrib/pg_upgrade/pg_upgrade.c b/contrib/pg_upgrade/pg_upgrade.c index 4d2e79cd48..63df52996d 100644 --- a/contrib/pg_upgrade/pg_upgrade.c +++ b/contrib/pg_upgrade/pg_upgrade.c @@ -92,7 +92,7 @@ main(int argc, char **argv) check_cluster_compatibility(live_check); - check_old_cluster(live_check, &sequence_script_file_name); + check_and_dump_old_cluster(live_check, &sequence_script_file_name); /* -- NEW -- */ @@ -150,6 +150,12 @@ main(int argc, char **argv) new_cluster.pgdata); check_ok(); + prep_status("Sync data directory to disk"); + exec_prog(UTILITY_LOG_FILE, NULL, true, + "\"%s/initdb\" --sync-only \"%s\"", new_cluster.bindir, + new_cluster.pgdata); + check_ok(); + create_script_for_cluster_analyze(&analyze_script_file_name); create_script_for_old_cluster_deletion(&deletion_script_file_name); @@ -248,7 +254,7 @@ prepare_new_databases(void) set_frozenxids(); - prep_status("Creating databases in the new cluster"); + prep_status("Restoring global objects in the new cluster"); /* * Install support functions in the global-object restore database to @@ -282,6 +288,11 @@ create_new_objects(void) prep_status("Adding support functions to new cluster"); + /* + * Technically, we only need to install these support functions in new + * databases that also exist in the old cluster, but for completeness + * we process all new databases. 
+ */ for (dbnum = 0; dbnum < new_cluster.dbarr.ndbs; dbnum++) { DbInfo *new_db = &new_cluster.dbarr.dbs[dbnum]; @@ -292,11 +303,27 @@ create_new_objects(void) } check_ok(); - prep_status("Restoring database schema to new cluster"); - exec_prog(RESTORE_LOG_FILE, NULL, true, - "\"%s/psql\" " EXEC_PSQL_ARGS " %s -f \"%s\"", - new_cluster.bindir, cluster_conn_opts(&new_cluster), - DB_DUMP_FILE); + prep_status("Restoring database schemas in the new cluster\n"); + + for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++) + { + char file_name[MAXPGPATH]; + DbInfo *old_db = &old_cluster.dbarr.dbs[dbnum]; + + pg_log(PG_REPORT, OVERWRITE_MESSAGE, old_db->db_name); + snprintf(file_name, sizeof(file_name), DB_DUMP_FILE_MASK, old_db->db_oid); + + /* + * Using pg_restore --single-transaction is faster than other + * methods, like --jobs. pg_dump only produces its output at the + * end, so there is little parallelism using the pipe. + */ + exec_prog(RESTORE_LOG_FILE, NULL, true, + "\"%s/pg_restore\" %s --exit-on-error --single-transaction --verbose --dbname \"%s\" \"%s\"", + new_cluster.bindir, cluster_conn_opts(&new_cluster), + old_db->db_name, file_name); + } + end_progress_output(); check_ok(); /* regenerate now that we have objects in the databases */ @@ -455,14 +482,23 @@ cleanup(void) /* Remove dump and log files? */ if (!log_opts.retain) { + int dbnum; char **filename; for (filename = output_files; *filename != NULL; filename++) unlink(*filename); - /* remove SQL files */ - unlink(ALL_DUMP_FILE); + /* remove dump files */ unlink(GLOBALS_DUMP_FILE); - unlink(DB_DUMP_FILE); + + if (old_cluster.dbarr.dbs) + for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++) + { + char file_name[MAXPGPATH]; + DbInfo *old_db = &old_cluster.dbarr.dbs[dbnum]; + + snprintf(file_name, sizeof(file_name), DB_DUMP_FILE_MASK, old_db->db_oid); + unlink(file_name); + } } } diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h index 305834375f..d98103508b 100644 --- a/contrib/pg_upgrade/pg_upgrade.h +++ b/contrib/pg_upgrade/pg_upgrade.h @@ -7,7 +7,6 @@ #include #include -#include #include #include @@ -30,10 +29,9 @@ #define OVERWRITE_MESSAGE " %-" MESSAGE_WIDTH "." 
MESSAGE_WIDTH "s\r" #define GET_MAJOR_VERSION(v) ((v) / 100) -#define ALL_DUMP_FILE "pg_upgrade_dump_all.sql" /* contains both global db information and CREATE DATABASE commands */ #define GLOBALS_DUMP_FILE "pg_upgrade_dump_globals.sql" -#define DB_DUMP_FILE "pg_upgrade_dump_db.sql" +#define DB_DUMP_FILE_MASK "pg_upgrade_dump_%u.custom" #define SERVER_LOG_FILE "pg_upgrade_server.log" #define RESTORE_LOG_FILE "pg_upgrade_restore.log" @@ -297,12 +295,12 @@ extern OSInfo os_info; /* check.c */ void output_check_banner(bool *live_check); -void check_old_cluster(bool live_check, +void check_and_dump_old_cluster(bool live_check, char **sequence_script_file_name); void check_new_cluster(void); void report_clusters_compatible(void); void issue_warnings(char *sequence_script_file_name); -void output_completion_banner(char *analyze_script_file_name, +void output_completion_banner(char *analyze_script_file_name, char *deletion_script_file_name); void check_cluster_versions(void); void check_cluster_compatibility(bool live_check); @@ -320,7 +318,6 @@ void disable_old_cluster(void); /* dump.c */ void generate_old_dump(void); -void split_old_dump(void); /* exec.c */ @@ -366,7 +363,6 @@ const char *setupPageConverter(pageCnvCtx **result); typedef void *pageCnvCtx; #endif -int load_directory(const char *dirname, char ***namelist); const char *copyAndUpdateFile(pageCnvCtx *pageConverter, const char *src, const char *dst, bool force); const char *linkAndUpdateFile(pageCnvCtx *pageConverter, const char *src, @@ -388,7 +384,6 @@ FileNameMap *gen_db_file_maps(DbInfo *old_db, DbInfo *new_db, int *nmaps, const char *old_pgdata, const char *new_pgdata); void get_db_and_rel_infos(ClusterInfo *cluster); -void free_db_and_rel_infos(DbInfoArr *db_arr); void print_maps(FileNameMap *maps, int n, const char *db_name); @@ -436,6 +431,7 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3))); void pg_log(eLogType type, char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3))); +void end_progress_output(void); void prep_status(const char *fmt,...) __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); diff --git a/contrib/pg_upgrade/relfilenode.c b/contrib/pg_upgrade/relfilenode.c index 33a867f0d0..14e66df500 100644 --- a/contrib/pg_upgrade/relfilenode.c +++ b/contrib/pg_upgrade/relfilenode.c @@ -17,9 +17,8 @@ static void transfer_single_new_db(pageCnvCtx *pageConverter, FileNameMap *maps, int size); -static void transfer_relfile(pageCnvCtx *pageConverter, - const char *fromfile, const char *tofile, - const char *nspname, const char *relname); +static void transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, + const char *suffix); /* @@ -83,9 +82,7 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, } } - prep_status(" "); /* in case nothing printed; pass a space so - * gcc doesn't complain about empty format - * string */ + end_progress_output(); check_ok(); return msg; @@ -131,55 +128,21 @@ static void transfer_single_new_db(pageCnvCtx *pageConverter, FileNameMap *maps, int size) { - char old_dir[MAXPGPATH]; - char file_pattern[MAXPGPATH]; - char **namelist = NULL; - int numFiles = 0; int mapnum; - int fileno; - bool vm_crashsafe_change = false; - - old_dir[0] = '\0'; - - /* Do not copy non-crashsafe vm files for binaries that assume crashsafety */ + bool vm_crashsafe_match = true; + + /* + * Do the old and new cluster disagree on the crash-safetiness of the vm + * files? If so, do not copy them. 
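+	 * (They disagree when only the new cluster's catalog version is at or
+	 * past VISIBILITY_MAP_CRASHSAFE_CAT_VER, which is what the test below
+	 * checks.)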
+ */ if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_CRASHSAFE_CAT_VER && new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) - vm_crashsafe_change = true; + vm_crashsafe_match = false; for (mapnum = 0; mapnum < size; mapnum++) { - char old_file[MAXPGPATH]; - char new_file[MAXPGPATH]; - - /* Changed tablespaces? Need a new directory scan? */ - if (strcmp(maps[mapnum].old_dir, old_dir) != 0) - { - if (numFiles > 0) - { - for (fileno = 0; fileno < numFiles; fileno++) - pg_free(namelist[fileno]); - pg_free(namelist); - } - - snprintf(old_dir, sizeof(old_dir), "%s", maps[mapnum].old_dir); - numFiles = load_directory(old_dir, &namelist); - } - - /* Copying files might take some time, so give feedback. */ - - snprintf(old_file, sizeof(old_file), "%s/%u", maps[mapnum].old_dir, - maps[mapnum].old_relfilenode); - snprintf(new_file, sizeof(new_file), "%s/%u", maps[mapnum].new_dir, - maps[mapnum].new_relfilenode); - pg_log(PG_REPORT, OVERWRITE_MESSAGE, old_file); - - /* - * Copy/link the relation's primary file (segment 0 of main fork) - * to the new cluster - */ - unlink(new_file); - transfer_relfile(pageConverter, old_file, new_file, - maps[mapnum].nspname, maps[mapnum].relname); + /* transfer primary file */ + transfer_relfile(pageConverter, &maps[mapnum], ""); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) @@ -187,67 +150,11 @@ transfer_single_new_db(pageCnvCtx *pageConverter, /* * Copy/link any fsm and vm files, if they exist */ - snprintf(file_pattern, sizeof(file_pattern), "%u_", - maps[mapnum].old_relfilenode); - - for (fileno = 0; fileno < numFiles; fileno++) - { - char *vm_offset = strstr(namelist[fileno], "_vm"); - bool is_vm_file = false; - - /* Is a visibility map file? (name ends with _vm) */ - if (vm_offset && strlen(vm_offset) == strlen("_vm")) - is_vm_file = true; - - if (strncmp(namelist[fileno], file_pattern, - strlen(file_pattern)) == 0 && - (!is_vm_file || !vm_crashsafe_change)) - { - snprintf(old_file, sizeof(old_file), "%s/%s", maps[mapnum].old_dir, - namelist[fileno]); - snprintf(new_file, sizeof(new_file), "%s/%u%s", maps[mapnum].new_dir, - maps[mapnum].new_relfilenode, strchr(namelist[fileno], '_')); - - unlink(new_file); - transfer_relfile(pageConverter, old_file, new_file, - maps[mapnum].nspname, maps[mapnum].relname); - } - } - } - - /* - * Now copy/link any related segments as well. Remember, PG breaks - * large files into 1GB segments, the first segment has no extension, - * subsequent segments are named relfilenode.1, relfilenode.2, - * relfilenode.3, ... 'fsm' and 'vm' files use underscores so are not - * copied. 
- */ - snprintf(file_pattern, sizeof(file_pattern), "%u.", - maps[mapnum].old_relfilenode); - - for (fileno = 0; fileno < numFiles; fileno++) - { - if (strncmp(namelist[fileno], file_pattern, - strlen(file_pattern)) == 0) - { - snprintf(old_file, sizeof(old_file), "%s/%s", maps[mapnum].old_dir, - namelist[fileno]); - snprintf(new_file, sizeof(new_file), "%s/%u%s", maps[mapnum].new_dir, - maps[mapnum].new_relfilenode, strchr(namelist[fileno], '.')); - - unlink(new_file); - transfer_relfile(pageConverter, old_file, new_file, - maps[mapnum].nspname, maps[mapnum].relname); - } + transfer_relfile(pageConverter, &maps[mapnum], "_fsm"); + if (vm_crashsafe_match) + transfer_relfile(pageConverter, &maps[mapnum], "_vm"); } } - - if (numFiles > 0) - { - for (fileno = 0; fileno < numFiles; fileno++) - pg_free(namelist[fileno]); - pg_free(namelist); - } } @@ -257,31 +164,79 @@ transfer_single_new_db(pageCnvCtx *pageConverter, * Copy or link file from old cluster to new one. */ static void -transfer_relfile(pageCnvCtx *pageConverter, const char *old_file, - const char *new_file, const char *nspname, const char *relname) +transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map, + const char *type_suffix) { const char *msg; - - if ((user_opts.transfer_mode == TRANSFER_MODE_LINK) && (pageConverter != NULL)) - pg_log(PG_FATAL, "This upgrade requires page-by-page conversion, " - "you must use copy mode instead of link mode.\n"); - - if (user_opts.transfer_mode == TRANSFER_MODE_COPY) + char old_file[MAXPGPATH]; + char new_file[MAXPGPATH]; + int fd; + int segno; + char extent_suffix[65]; + + /* + * Now copy/link any related segments as well. Remember, PG breaks + * large files into 1GB segments, the first segment has no extension, + * subsequent segments are named relfilenode.1, relfilenode.2, + * relfilenode.3. + * copied. + */ + for (segno = 0;; segno++) { - pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); + if (segno == 0) + extent_suffix[0] = '\0'; + else + snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno); + + snprintf(old_file, sizeof(old_file), "%s/%u%s%s", map->old_dir, + map->old_relfilenode, type_suffix, extent_suffix); + snprintf(new_file, sizeof(new_file), "%s/%u%s%s", map->new_dir, + map->new_relfilenode, type_suffix, extent_suffix); + + /* Is it an extent, fsm, or vm file? */ + if (type_suffix[0] != '\0' || segno != 0) + { + /* Did file open fail? */ + if ((fd = open(old_file, O_RDONLY, 0)) == -1) + { + /* File does not exist? That's OK, just return */ + if (errno == ENOENT) + return; + else + pg_log(PG_FATAL, "error while checking for file existance \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + map->nspname, map->relname, old_file, new_file, + getErrorText(errno)); + } + close(fd); + } - if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL) - pg_log(PG_FATAL, "error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", - nspname, relname, old_file, new_file, msg); - } - else - { - pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file); + unlink(new_file); + + /* Copying files might take some time, so give feedback. 
*/ + pg_log(PG_REPORT, OVERWRITE_MESSAGE, old_file); + + if ((user_opts.transfer_mode == TRANSFER_MODE_LINK) && (pageConverter != NULL)) + pg_log(PG_FATAL, "This upgrade requires page-by-page conversion, " + "you must use copy mode instead of link mode.\n"); + + if (user_opts.transfer_mode == TRANSFER_MODE_COPY) + { + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); + + if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL) + pg_log(PG_FATAL, "error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + map->nspname, map->relname, old_file, new_file, msg); + } + else + { + pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file); + + if ((msg = linkAndUpdateFile(pageConverter, old_file, new_file)) != NULL) + pg_log(PG_FATAL, + "error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + map->nspname, map->relname, old_file, new_file, msg); + } + } - if ((msg = linkAndUpdateFile(pageConverter, old_file, new_file)) != NULL) - pg_log(PG_FATAL, - "error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", - nspname, relname, old_file, new_file, msg); - } return; } diff --git a/contrib/pg_upgrade/server.c b/contrib/pg_upgrade/server.c index a9f9d85b78..db62249637 100644 --- a/contrib/pg_upgrade/server.c +++ b/contrib/pg_upgrade/server.c @@ -208,13 +208,20 @@ start_postmaster(ClusterInfo *cluster) * maximum. We assume all datfrozenxid and relfrozen values are less than * a gap of 2000000000 from the current xid counter, so autovacuum will * not touch them. + * + * Turn off durability requirements to improve object creation speed, and + * we only modify the new cluster, so only use it there. If there is a + * crash, the new cluster has to be recreated anyway. fsync=off is a big + * win on ext4. */ snprintf(cmd, sizeof(cmd), - "\"%s/pg_ctl\" -w -l \"%s\" -D \"%s\" -o \"-p %d %s %s%s\" start", + "\"%s/pg_ctl\" -w -l \"%s\" -D \"%s\" -o \"-p %d%s%s%s%s\" start", cluster->bindir, SERVER_LOG_FILE, cluster->pgconfig, cluster->port, (cluster->controldata.cat_ver >= - BINARY_UPGRADE_SERVER_FLAG_CAT_VER) ? "-b" : - "-c autovacuum=off -c autovacuum_freeze_max_age=2000000000", + BINARY_UPGRADE_SERVER_FLAG_CAT_VER) ? " -b" : + " -c autovacuum=off -c autovacuum_freeze_max_age=2000000000", + (cluster == &new_cluster) ? + " -c synchronous_commit=off -c fsync=off -c full_page_writes=off" : "", cluster->pgopts ? 
cluster->pgopts : "", socket_string); /* diff --git a/contrib/pg_upgrade/test.sh b/contrib/pg_upgrade/test.sh index 32fb6bf56f..0ccb8a5985 100644 --- a/contrib/pg_upgrade/test.sh +++ b/contrib/pg_upgrade/test.sh @@ -15,7 +15,7 @@ set -e : ${PGPORT=50432} export PGPORT -testhost=`uname -o` +testhost=`uname -s` temp_root=$PWD/tmp_check @@ -58,10 +58,20 @@ PGDATA=$temp_root/data export PGDATA rm -rf "$PGDATA" "$PGDATA".old +unset PGDATABASE +unset PGUSER +unset PGSERVICE +unset PGSSLMODE +unset PGREQUIRESSL +unset PGCONNECT_TIMEOUT +unset PGHOST +unset PGHOSTADDR + logdir=$PWD/log rm -rf "$logdir" mkdir "$logdir" +# enable echo so the user can see what is being executed set -x $oldbindir/initdb -N @@ -110,23 +120,27 @@ pg_upgrade -d "${PGDATA}.old" -D "${PGDATA}" -b "$oldbindir" -B "$bindir" pg_ctl start -l "$logdir/postmaster2.log" -o '-F' -w -if [ $testhost = Msys ] ; then - cmd /c analyze_new_cluster.bat -else - sh ./analyze_new_cluster.sh -fi +case $testhost in + MINGW*) cmd /c analyze_new_cluster.bat ;; + *) sh ./analyze_new_cluster.sh ;; +esac + pg_dumpall -f "$temp_root"/dump2.sql || pg_dumpall2_status=$? pg_ctl -m fast stop + +# no need to echo commands anymore +set +x +echo + if [ -n "$pg_dumpall2_status" ]; then echo "pg_dumpall of post-upgrade database cluster failed" exit 1 fi -if [ $testhost = Msys ] ; then - cmd /c delete_old_cluster.bat -else - sh ./delete_old_cluster.sh -fi +case $testhost in + MINGW*) cmd /c delete_old_cluster.bat ;; + *) sh ./delete_old_cluster.sh ;; +esac if diff -q "$temp_root"/dump1.sql "$temp_root"/dump2.sql; then echo PASSED diff --git a/contrib/pg_upgrade/util.c b/contrib/pg_upgrade/util.c index 1d4bc89f0b..0c1ecccaa7 100644 --- a/contrib/pg_upgrade/util.c +++ b/contrib/pg_upgrade/util.c @@ -35,6 +35,18 @@ report_status(eLogType type, const char *fmt,...) } +/* force blank output for progress display */ +void +end_progress_output(void) +{ + /* + * In case nothing printed; pass a space so gcc doesn't complain about + * empty format string. 
+ */ + prep_status(" "); +} + + /* * prep_status * diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c index 5d48aeeae4..e376452228 100644 --- a/contrib/pgbench/pgbench.c +++ b/contrib/pgbench/pgbench.c @@ -1444,7 +1444,7 @@ init(bool is_no_vacuum) if (j % 100000 == 0) fprintf(stderr, "%d of %d tuples (%d%%) done.\n", j, naccounts * scale, - j * 100 / (naccounts * scale)); + (int) (((int64) j * 100) / (naccounts * scale))); } if (PQputline(con, "\\.\n")) { @@ -1915,6 +1915,15 @@ printResults(int ttype, int normal_xacts, int nclients, int main(int argc, char **argv) { + static struct option long_options[] = { + {"foreign-keys", no_argument, &foreign_keys, 1}, + {"index-tablespace", required_argument, NULL, 3}, + {"tablespace", required_argument, NULL, 2}, + {"unlogged-tables", no_argument, &unlogged_tables, 1}, + {"sampling-rate", required_argument, NULL, 4}, + {NULL, 0, NULL, 0} + }; + int c; int nclients = 1; /* default number of simulated clients */ int nthreads = 1; /* default number of threads */ @@ -1937,15 +1946,6 @@ main(int argc, char **argv) int i; - static struct option long_options[] = { - {"foreign-keys", no_argument, &foreign_keys, 1}, - {"index-tablespace", required_argument, NULL, 3}, - {"tablespace", required_argument, NULL, 2}, - {"unlogged-tables", no_argument, &unlogged_tables, 1}, - {"sampling-rate", required_argument, NULL, 4}, - {NULL, 0, NULL, 0} - }; - #ifdef HAVE_GETRLIMIT struct rlimit rlim; #endif diff --git a/contrib/sepgsql/expected/ddl.out b/contrib/sepgsql/expected/ddl.out index e7a8d9c301..1f7ea886b0 100644 --- a/contrib/sepgsql/expected/ddl.out +++ b/contrib/sepgsql/expected/ddl.out @@ -34,6 +34,8 @@ LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_ LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column ctid" LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column x" LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column y" +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table" ALTER TABLE regtest_table ADD COLUMN z int; LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column z" CREATE TABLE regtest_table_2 (a int) WITH OIDS; @@ -93,6 +95,55 @@ LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfine LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="function regtest_func_2(integer)" RESET SESSION AUTHORIZATION; -- +-- ALTER and CREATE/DROP extra attribute permissions +-- +CREATE TABLE regtest_table_4 (x int primary key, y int, z int); +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 
tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column tableoid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column cmax" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column xmax" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column cmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column xmin" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column ctid" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column x" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column y" +LOG: SELinux: allowed { create } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column z" +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +CREATE INDEX regtest_index_tbl4_y ON regtest_table_4(y); +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +CREATE INDEX regtest_index_tbl4_z ON regtest_table_4(z); +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +ALTER TABLE regtest_table_4 ALTER COLUMN y TYPE float; +DROP INDEX regtest_index_tbl4_y; +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 
tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +ALTER TABLE regtest_table_4 + ADD CONSTRAINT regtest_tbl4_con EXCLUDE USING btree (z WITH =); +LOG: SELinux: allowed { add_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +DROP TABLE regtest_table_4 CASCADE; +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table_4" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column tableoid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column cmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column xmax" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column cmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column xmin" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column ctid" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column x" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column 
name="table regtest_table_4 column y" +LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table_4 column z" +-- -- DROP Permission checks (with clean-up) -- DROP FUNCTION regtest_func(text,int[]); @@ -115,6 +166,8 @@ DROP TABLE regtest_table; LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_seq_t:s0 tclass=db_sequence name="sequence regtest_table_x_seq" LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" +LOG: SELinux: allowed { setattr } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table" +LOG: SELinux: allowed { remove_name } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_schema_t:s0 tclass=db_schema name="schema regtest_schema" LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="table regtest_table" LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column tableoid" LOG: SELinux: allowed { drop } scontext=unconfined_u:unconfined_r:unconfined_t:s0 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table regtest_table column cmax" diff --git a/contrib/sepgsql/hooks.c b/contrib/sepgsql/hooks.c index f3cf1c5f88..ab55d6ea4b 100644 --- a/contrib/sepgsql/hooks.c +++ b/contrib/sepgsql/hooks.c @@ -38,7 +38,6 @@ void _PG_init(void); static object_access_hook_type next_object_access_hook = NULL; static ExecutorCheckPerms_hook_type next_exec_check_perms_hook = NULL; static ProcessUtility_hook_type next_ProcessUtility_hook = NULL; -static ExecutorStart_hook_type next_ExecutorStart_hook = NULL; /* * Contextual information on DDL commands @@ -97,53 +96,55 @@ sepgsql_object_access(ObjectAccessType access, switch (access) { case OAT_POST_CREATE: - switch (classId) { - case DatabaseRelationId: - sepgsql_database_post_create(objectId, - sepgsql_context_info.createdb_dtemplate); - break; + ObjectAccessPostCreate *pc_arg = arg; + bool is_internal; - case NamespaceRelationId: - sepgsql_schema_post_create(objectId); - break; + is_internal = pc_arg ? pc_arg->is_internal : false; - case RelationRelationId: - if (subId == 0) - { - /* - * All cases we want to apply permission checks on - * creation of a new relation are invocation of the - * heap_create_with_catalog via DefineRelation or - * OpenIntoRel. Elsewhere, we need neither assignment - * of security label nor permission checks. 
- */ - switch (sepgsql_context_info.cmdtype) + switch (classId) + { + case DatabaseRelationId: + Assert(!is_internal); + sepgsql_database_post_create(objectId, + sepgsql_context_info.createdb_dtemplate); + break; + + case NamespaceRelationId: + Assert(!is_internal); + sepgsql_schema_post_create(objectId); + break; + + case RelationRelationId: + if (subId == 0) { - case T_CreateStmt: - case T_ViewStmt: - case T_CreateSeqStmt: - case T_CompositeTypeStmt: - case T_CreateForeignTableStmt: - case T_SelectStmt: - sepgsql_relation_post_create(objectId); - break; - default: - /* via make_new_heap() */ + /* + * The cases in which we want to apply permission + * checks on creation of a new relation correspond + * to direct user invocation. For internal uses, + * that is creation of toast tables, index rebuild + * or ALTER TABLE commands, we need neither + * assignment of security labels nor permission + * checks. + */ + if (is_internal) break; + + sepgsql_relation_post_create(objectId); } - } - else - sepgsql_attribute_post_create(objectId, subId); - break; + else + sepgsql_attribute_post_create(objectId, subId); + break; - case ProcedureRelationId: - sepgsql_proc_post_create(objectId); - break; + case ProcedureRelationId: + Assert(!is_internal); + sepgsql_proc_post_create(objectId); + break; - default: - /* Ignore unsupported object classes */ - break; + default: + /* Ignore unsupported object classes */ + break; + } } break; @@ -215,46 +216,6 @@ sepgsql_exec_check_perms(List *rangeTabls, bool abort) return true; } -/* - * sepgsql_executor_start - * - * It saves contextual information during ExecutorStart to distinguish - * a case with/without permission checks later. - */ -static void -sepgsql_executor_start(QueryDesc *queryDesc, int eflags) -{ - sepgsql_context_info_t saved_context_info = sepgsql_context_info; - - PG_TRY(); - { - if (queryDesc->operation == CMD_SELECT) - sepgsql_context_info.cmdtype = T_SelectStmt; - else if (queryDesc->operation == CMD_INSERT) - sepgsql_context_info.cmdtype = T_InsertStmt; - else if (queryDesc->operation == CMD_DELETE) - sepgsql_context_info.cmdtype = T_DeleteStmt; - else if (queryDesc->operation == CMD_UPDATE) - sepgsql_context_info.cmdtype = T_UpdateStmt; - - /* - * XXX - If queryDesc->operation is not above four cases, an error - * shall be raised on the following executor stage soon. 
- */ - if (next_ExecutorStart_hook) - (*next_ExecutorStart_hook) (queryDesc, eflags); - else - standard_ExecutorStart(queryDesc, eflags); - } - PG_CATCH(); - { - sepgsql_context_info = saved_context_info; - PG_RE_THROW(); - } - PG_END_TRY(); - sepgsql_context_info = saved_context_info; -} - /* * sepgsql_utility_command * @@ -425,10 +386,6 @@ _PG_init(void) next_ProcessUtility_hook = ProcessUtility_hook; ProcessUtility_hook = sepgsql_utility_command; - /* ExecutorStart hook */ - next_ExecutorStart_hook = ExecutorStart_hook; - ExecutorStart_hook = sepgsql_executor_start; - /* init contextual info */ memset(&sepgsql_context_info, 0, sizeof(sepgsql_context_info)); } diff --git a/contrib/sepgsql/relation.c b/contrib/sepgsql/relation.c index 4ab7fc8be9..783f330d1c 100644 --- a/contrib/sepgsql/relation.c +++ b/contrib/sepgsql/relation.c @@ -23,11 +23,14 @@ #include "utils/fmgroids.h" #include "utils/catcache.h" #include "utils/lsyscache.h" +#include "utils/rel.h" #include "utils/syscache.h" #include "utils/tqual.h" #include "sepgsql.h" +static void sepgsql_index_modify(Oid indexOid); + /* * sepgsql_attribute_post_create * @@ -229,6 +232,23 @@ sepgsql_relation_post_create(Oid relOid) classForm = (Form_pg_class) GETSTRUCT(tuple); + /* ignore indexes on toast tables */ + if (classForm->relkind == RELKIND_INDEX && + classForm->relnamespace == PG_TOAST_NAMESPACE) + goto out; + + /* + * check db_schema:{add_name} permission of the namespace + */ + object.classId = NamespaceRelationId; + object.objectId = classForm->relnamespace; + object.objectSubId = 0; + sepgsql_avc_check_perms(&object, + SEPG_CLASS_DB_SCHEMA, + SEPG_DB_SCHEMA__ADD_NAME, + getObjectDescription(&object), + true); + switch (classForm->relkind) { case RELKIND_RELATION: @@ -243,22 +263,15 @@ sepgsql_relation_post_create(Oid relOid) tclass = SEPG_CLASS_DB_VIEW; tclass_text = "view"; break; + case RELKIND_INDEX: + /* deal with indexes specially; no need for tclass */ + sepgsql_index_modify(relOid); + goto out; default: + /* ignore other relkinds */ goto out; } - /* - * check db_schema:{add_name} permission of the namespace - */ - object.classId = NamespaceRelationId; - object.objectId = classForm->relnamespace; - object.objectSubId = 0; - sepgsql_avc_check_perms(&object, - SEPG_CLASS_DB_SCHEMA, - SEPG_DB_SCHEMA__ADD_NAME, - getObjectDescription(&object), - true); - /* * Compute a default security label when we create a new relation object * under the specified namespace. 
@@ -342,6 +355,7 @@ sepgsql_relation_post_create(Oid relOid) heap_close(arel, AccessShareLock); } pfree(rcontext); + out: systable_endscan(sscan); heap_close(rel, AccessShareLock); @@ -357,18 +371,31 @@ sepgsql_relation_drop(Oid relOid) { ObjectAddress object; char *audit_name; - uint16_t tclass = 0; + uint16_t tclass; char relkind; relkind = get_rel_relkind(relOid); - if (relkind == RELKIND_RELATION) - tclass = SEPG_CLASS_DB_TABLE; - else if (relkind == RELKIND_SEQUENCE) - tclass = SEPG_CLASS_DB_SEQUENCE; - else if (relkind == RELKIND_VIEW) - tclass = SEPG_CLASS_DB_VIEW; - else - return; + switch (relkind) + { + case RELKIND_RELATION: + tclass = SEPG_CLASS_DB_TABLE; + break; + case RELKIND_SEQUENCE: + tclass = SEPG_CLASS_DB_SEQUENCE; + break; + case RELKIND_VIEW: + tclass = SEPG_CLASS_DB_VIEW; + break; + case RELKIND_INDEX: + /* ignore indexes on toast tables */ + if (get_rel_namespace(relOid) == PG_TOAST_NAMESPACE) + return; + /* other indexes are handled specially below; no need for tclass */ + break; + default: + /* ignore other relkinds */ + return; + } /* * check db_schema:{remove_name} permission @@ -385,6 +412,13 @@ sepgsql_relation_drop(Oid relOid) true); pfree(audit_name); + /* deal with indexes specially */ + if (relkind == RELKIND_INDEX) + { + sepgsql_index_modify(relOid); + return; + } + /* * check db_table/sequence/view:{drop} permission */ @@ -486,3 +520,121 @@ sepgsql_relation_relabel(Oid relOid, const char *seclabel) true); pfree(audit_name); } + +/* + * sepgsql_relation_setattr + * + * It checks privileges to set attribute of the supplied relation + */ +void +sepgsql_relation_setattr(Oid relOid) +{ + ObjectAddress object; + char *audit_name; + uint16_t tclass; + + switch (get_rel_relkind(relOid)) + { + case RELKIND_RELATION: + tclass = SEPG_CLASS_DB_TABLE; + break; + case RELKIND_SEQUENCE: + tclass = SEPG_CLASS_DB_SEQUENCE; + break; + case RELKIND_VIEW: + tclass = SEPG_CLASS_DB_VIEW; + break; + case RELKIND_INDEX: + /* deal with indexes specially */ + sepgsql_index_modify(relOid); + return; + default: + /* other relkinds don't need additional work */ + return; + } + + object.classId = RelationRelationId; + object.objectId = relOid; + object.objectSubId = 0; + audit_name = getObjectDescription(&object); + + /* + * XXX - we should add checks related to namespace stuff, when + * object_access_hook get support for ALTER statement. Right now, there is + * no invocation path on ALTER ... RENAME TO / SET SCHEMA. + */ + + /* + * check db_xxx:{setattr} permission + */ + sepgsql_avc_check_perms(&object, + tclass, + SEPG_DB_TABLE__SETATTR, + audit_name, + true); + pfree(audit_name); +} + +/* + * sepgsql_relation_setattr_extra + * + * It checks permission of the relation being referenced by extra attributes, + * such as pg_index entries. Like core PostgreSQL, sepgsql also does not deal + * with such entries as individual "objects", thus, modification of these + * entries shall be considered as setting an attribute of the underlying + * relation. 
+ */ +static void +sepgsql_relation_setattr_extra(Relation catalog, + Oid catindex_id, + Oid extra_oid, + AttrNumber anum_relation_id, + AttrNumber anum_extra_id) +{ + ScanKeyData skey; + SysScanDesc sscan; + HeapTuple tuple; + Datum datum; + bool isnull; + + ScanKeyInit(&skey, anum_extra_id, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(extra_oid)); + + sscan = systable_beginscan(catalog, catindex_id, true, + SnapshotSelf, 1, &skey); + tuple = systable_getnext(sscan); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "catalog lookup failed for object %u in catalog \"%s\"", + extra_oid, RelationGetRelationName(catalog)); + + datum = heap_getattr(tuple, anum_relation_id, + RelationGetDescr(catalog), &isnull); + Assert(!isnull); + + sepgsql_relation_setattr(DatumGetObjectId(datum)); + + systable_endscan(sscan); +} + +/* + * sepgsql_index_modify + * Handle index create, update, drop + * + * Unlike other relation kinds, indexes do not have their own security labels, + * so instead of doing checks directly, treat them as extra attributes of their + * owning tables; so check 'setattr' permissions on the table. + */ +static void +sepgsql_index_modify(Oid indexOid) +{ + Relation catalog = heap_open(IndexRelationId, AccessShareLock); + + /* check db_table:{setattr} permission of the table being indexed */ + sepgsql_relation_setattr_extra(catalog, + IndexRelidIndexId, + indexOid, + Anum_pg_index_indrelid, + Anum_pg_index_indexrelid); + heap_close(catalog, AccessShareLock); +} diff --git a/contrib/sepgsql/sepgsql.h b/contrib/sepgsql/sepgsql.h index 9c89eaa893..b6dcb86e55 100644 --- a/contrib/sepgsql/sepgsql.h +++ b/contrib/sepgsql/sepgsql.h @@ -145,7 +145,6 @@ #define SEPG_DB_TABLE__INSERT (1<<8) #define SEPG_DB_TABLE__DELETE (1<<9) #define SEPG_DB_TABLE__LOCK (1<<10) -#define SEPG_DB_TABLE__INDEXON (1<<11) #define SEPG_DB_SEQUENCE__CREATE (SEPG_DB_DATABASE__CREATE) #define SEPG_DB_SEQUENCE__DROP (SEPG_DB_DATABASE__DROP) @@ -312,6 +311,7 @@ extern void sepgsql_attribute_relabel(Oid relOid, AttrNumber attnum, extern void sepgsql_relation_post_create(Oid relOid); extern void sepgsql_relation_drop(Oid relOid); extern void sepgsql_relation_relabel(Oid relOid, const char *seclabel); +extern void sepgsql_relation_setattr(Oid relOid); /* * proc.c diff --git a/contrib/sepgsql/sql/ddl.sql b/contrib/sepgsql/sql/ddl.sql index 8dd57e0eaf..5afe1ba193 100644 --- a/contrib/sepgsql/sql/ddl.sql +++ b/contrib/sepgsql/sql/ddl.sql @@ -59,6 +59,18 @@ CREATE FUNCTION regtest_func_2(int) RETURNS bool LANGUAGE plpgsql RESET SESSION AUTHORIZATION; +-- +-- ALTER and CREATE/DROP extra attribute permissions +-- +CREATE TABLE regtest_table_4 (x int primary key, y int, z int); +CREATE INDEX regtest_index_tbl4_y ON regtest_table_4(y); +CREATE INDEX regtest_index_tbl4_z ON regtest_table_4(z); +ALTER TABLE regtest_table_4 ALTER COLUMN y TYPE float; +DROP INDEX regtest_index_tbl4_y; +ALTER TABLE regtest_table_4 + ADD CONSTRAINT regtest_tbl4_con EXCLUDE USING btree (z WITH =); +DROP TABLE regtest_table_4 CASCADE; + -- -- DROP Permission checks (with clean-up) -- diff --git a/contrib/tcn/tcn.c b/contrib/tcn/tcn.c index 6a8a96f603..c9594f68ad 100644 --- a/contrib/tcn/tcn.c +++ b/contrib/tcn/tcn.c @@ -141,8 +141,8 @@ triggered_change_notification(PG_FUNCTION_ARGS) if (!HeapTupleIsValid(indexTuple)) /* should not happen */ elog(ERROR, "cache lookup failed for index %u", indexoid); index = (Form_pg_index) GETSTRUCT(indexTuple); - /* we're only interested if it is the primary key */ - if (index->indisprimary) + /* we're only interested 
if it is the primary key and valid */ + if (index->indisprimary && IndexIsValid(index)) { int numatts = index->indnatts; diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile index 5c3afad32b..f40a1fe7fc 100644 --- a/doc/src/sgml/Makefile +++ b/doc/src/sgml/Makefile @@ -255,8 +255,12 @@ postgres.xml: $(srcdir)/postgres.sgml $(ALMOSTALLSGML) rm postgres.xmltmp # ' hello Emacs -xslthtml: stylesheet.xsl postgres.xml +xslthtml: xslthtml-stamp + +xslthtml-stamp: stylesheet.xsl postgres.xml $(XSLTPROC) $(XSLTPROCFLAGS) $(XSLTPROC_HTML_FLAGS) $^ + cp $(srcdir)/stylesheet.css html/ + touch $@ htmlhelp: stylesheet-hh.xsl postgres.xml $(XSLTPROC) $(XSLTPROCFLAGS) $^ diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index f99919093c..5f270404bf 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -3480,7 +3480,7 @@ index is possibly incomplete: it must still be modified by INSERT/UPDATE operations, but it cannot safely be used for queries. If it is unique, the uniqueness property is not - true either. + guaranteed true either. @@ -3507,6 +3507,16 @@ + + indislive + bool + + + If false, the index is in process of being dropped, and should be + ignored for all purposes (including HOT-safety decisions) + + + indkey int2vector diff --git a/doc/src/sgml/client-auth.sgml b/doc/src/sgml/client-auth.sgml index 5b39269067..909c81bd40 100644 --- a/doc/src/sgml/client-auth.sgml +++ b/doc/src/sgml/client-auth.sgml @@ -1486,6 +1486,34 @@ omicron bryanh guest1 + + ldapurl + + + An RFC 4516 LDAP URL. This is an alternative way to write most of the + other LDAP options in a more compact and standard form. The format is + +ldap://[user[:password]@]host[:port]/basedn[?[attribute][?[scope]]] + + scope must be one + of base, one, sub, + typically the latter. Only one attribute is used, and some other + components of standard LDAP URLs such as filters and extensions are + not supported. + + + + To use encrypted LDAP connections, the ldaptls + option has to be used in addition to ldapurl. + The ldaps URL scheme (direct SSL connection) is not + supported. + + + + LDAP URLs are currently only supported with OpenLDAP, not on Windows. + + + @@ -1507,7 +1535,7 @@ host ... ldap ldapserver=ldap.example.net ldapprefix="cn=" ldapsuffix=", dc=exam - Here is an exaple for a search+bind configuration: + Here is an example for a search+bind configuration: host ... ldap ldapserver=ldap.example.net ldapbasedn="dc=example, dc=net" ldapsearchattribute=uid @@ -1520,6 +1548,15 @@ host ... ldap ldapserver=ldap.example.net ldapbasedn="dc=example, dc=net" ldapse If that second connection succeeds, the database access is granted. + + Here is the same search+bind configuration written as a URL: + +host ... ldap lapurl="ldap://ldap.example.net/dc=example,dc=net?uid?sub" + + Some other software that supports authentication against LDAP uses the + same URL format, so it will be easier to share the configuration. + + Since LDAP often uses commas and spaces to separate the different diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index b4fcbaf9c7..b7df8ce612 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -675,8 +675,9 @@ include 'filename' an otherwise idle connection. A value of 0 uses the system default. This parameter is supported only on systems that support the TCP_KEEPIDLE or TCP_KEEPALIVE symbols, and on - Windows; on other systems, it must be zero. This parameter is ignored - for connections made via a Unix-domain socket. 
+ Windows; on other systems, it must be zero. + In sessions connected via a Unix-domain socket, this parameter is + ignored and always reads as zero. @@ -698,8 +699,9 @@ include 'filename' otherwise idle connection. A value of 0 uses the system default. This parameter is supported only on systems that support the TCP_KEEPINTVL symbol, and on Windows; on other systems, it - must be zero. This parameter is ignored for connections made via a - Unix-domain socket. + must be zero. + In sessions connected via a Unix-domain socket, this parameter is + ignored and always reads as zero. @@ -720,8 +722,9 @@ include 'filename' Specifies the number of keepalive packets to send on an otherwise idle connection. A value of 0 uses the system default. This parameter is supported only on systems that support the TCP_KEEPCNT - symbol; on other systems, it must be zero. This parameter is ignored - for connections made via a Unix-domain socket. + symbol; on other systems, it must be zero. + In sessions connected via a Unix-domain socket, this parameter is + ignored and always reads as zero. @@ -1693,6 +1696,15 @@ include 'filename' turning off fsync. + + For reliable recovery when changing fsync + off to on, it is necessary to force all modified buffers in the + kernel to durable storage. This can be done while the cluster + is shutdown or while fsync is on by running initdb + --sync-only, running sync, unmounting the + file system, or rebooting the server. + + In many situations, turning off for noncritical transactions can provide much of the potential @@ -4807,7 +4819,7 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; Likewise, the current session's temporary-table schema, pg_temp_nnn, is always searched if it exists. It can be explicitly listed in the path by using the - alias pg_temp. If it is not listed in the path then + alias pg_temppg_temp. If it is not listed in the path then it is searched first (even before pg_catalog). However, the temporary schema is only searched for relation (table, view, sequence, etc) and data type names. It is never searched for diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 71cf59e38b..02763b58e6 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -3596,7 +3596,7 @@ SELECT person.name, holidays.num_weeks FROM person, holidays as the canonical form for MAC addresses, and specifies the first form (with colons) as the bit-reversed notation, so that 08-00-2b-01-02-03 = 01:00:4D:08:04:0C. This convention is widely - ignored nowadays, and it is only relevant for obsolete network + ignored nowadays, and it is relevant only for obsolete network protocols (such as Token Ring). PostgreSQL makes no provisions for bit reversal, and all accepted formats use the canonical LSB order. 
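Returning briefly to the tcp_keepalives_idle/interval/count paragraphs above: those settings map onto per-socket TCP options. Here is a minimal sketch, assuming a Linux-style socket API where the TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT symbols named in the documentation are available; the helper name and error handling are illustrative only, not backend code. As documented, a value of 0 leaves the system default untouched.

/*
 * Illustrative sketch: apply keepalive settings to a connected TCP socket,
 * roughly what the tcp_keepalives_* parameters control for each backend.
 * A value of 0 for any setting leaves the system default in place.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int
set_keepalive(int sock, int idle_secs, int interval_secs, int count)
{
	int			on = 1;

	if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
		return -1;
	if (idle_secs > 0 &&
		setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
				   &idle_secs, sizeof(idle_secs)) < 0)
		return -1;
	if (interval_secs > 0 &&
		setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
				   &interval_secs, sizeof(interval_secs)) < 0)
		return -1;
	if (count > 0 &&
		setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
				   &count, sizeof(count)) < 0)
		return -1;
	return 0;
}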
diff --git a/doc/src/sgml/docguide.sgml b/doc/src/sgml/docguide.sgml index 74eb5f28f2..09a46975e1 100644 --- a/doc/src/sgml/docguide.sgml +++ b/doc/src/sgml/docguide.sgml @@ -106,7 +106,7 @@ - DocBook DSSSL Stylesheets + DocBook DSSSL Stylesheets These contain the processing instructions for converting the @@ -117,7 +117,7 @@ - DocBook XSL Stylesheets + DocBook XSL Stylesheets This is another stylesheet for converting DocBook to other diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index f8f63d89f9..c8a2a149fa 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -7872,7 +7872,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple path(polygon) - point + path polygon to path path(polygon '((0,0),(1,1),(2,0))') diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index 3a46b96a9d..e6a14de504 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -196,8 +196,10 @@ CREATE INDEX name ON table Hash index operations are not presently WAL-logged, so hash indexes might need to be rebuilt with REINDEX - after a database crash. They are also not replicated over streaming or - file-based replication. + after a database crash if there were unwritten changes. + Also, changes to hash indexes are not replicated over streaming or + file-based replication after the initial base backup, so they + give wrong answers to queries that subsequently use them. For these reasons, hash index use is presently discouraged. diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml index 255c5c1abb..e7ad066fe5 100644 --- a/doc/src/sgml/libpq.sgml +++ b/doc/src/sgml/libpq.sgml @@ -496,6 +496,30 @@ typedef struct + + PQconninfoPQconninfo + + + Returns the connection options used by a live connection. + +PQconninfoOption *PQconninfo(PGconn *conn); + + + + + Returns a connection options array. This can be used to determine + all possible PQconnectdb options and the + values that were used to connect to the server. The return + value points to an array of PQconninfoOption + structures, which ends with an entry having a null keyword + pointer. All notes above for PQconndefaults also + apply to the result of PQconninfo. + + + + + + PQconninfoParsePQconninfoParse diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml index 301222c55d..998cb2fc9a 100644 --- a/doc/src/sgml/pgupgrade.sgml +++ b/doc/src/sgml/pgupgrade.sgml @@ -529,7 +529,10 @@ psql --username postgres --file script.sql postgres A Log-Shipping Standby Server () cannot be upgraded because the server must allow writes. The simplest way - is to upgrade the primary and use rsync to rebuild the standbys. + is to upgrade the primary and use rsync to rebuild the + standbys. You can run rsync while the primary is down, + or as part of a base backup () + which overwrites the old standby cluster. diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index 3d72a162eb..f87020c909 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -1359,14 +1359,18 @@ The commands accepted in walsender mode are: has already been recycled. On success, server responds with a CopyBothResponse message, and then starts to stream WAL to the frontend. WAL will continue to be streamed until the connection is broken; - no further commands will be accepted. + no further commands will be accepted. If the WAL sender process is + terminated normally (during postmaster shutdown), it will send a + CommandComplete message before exiting. 
This might not happen during an + abnormal shutdown, of course. WAL data is sent as a series of CopyData messages. (This allows other information to be intermixed; in particular the server can send an ErrorResponse message if it encounters a failure after beginning - to stream.) The payload in each CopyData message follows this format: + to stream.) The payload of each CopyData message from server to the + client contains a message of one of the following formats: @@ -1390,34 +1394,32 @@ The commands accepted in walsender mode are: - Byte8 + Int64 - The starting point of the WAL data in this message, given in - XLogRecPtr format. + The starting point of the WAL data in this message. - Byte8 + Int64 - The current end of WAL on the server, given in - XLogRecPtr format. + The current end of WAL on the server. - Byte8 + Int64 - The server's system clock at the time of transmission, - given in TimestampTz format. + The server's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. @@ -1429,42 +1431,19 @@ The commands accepted in walsender mode are: A section of the WAL data stream. + + A single WAL record is never split across two XLogData messages. + When a WAL record crosses a WAL page boundary, and is therefore + already split using continuation records, it can be split at the page + boundary. In other words, the first main WAL record and its + continuation records can be sent in different XLogData messages. + - - - - A single WAL record is never split across two CopyData messages. - When a WAL record crosses a WAL page boundary, and is therefore - already split using continuation records, it can be split at the page - boundary. In other words, the first main WAL record and its - continuation records can be sent in different CopyData messages. - - - Note that all fields within the WAL data and the above-described header - will be in the sending server's native format. Endianness, and the - format for the timestamp, are unpredictable unless the receiver has - verified that the sender's system identifier matches its own - pg_control contents. - - - If the WAL sender process is terminated normally (during postmaster - shutdown), it will send a CommandComplete message before exiting. - This might not happen during an abnormal shutdown, of course. - - - - The receiving process can send replies back to the sender at any time, - using one of the following message formats (also in the payload of a - CopyData message): - - - - Primary keepalive message (B) @@ -1484,23 +1463,33 @@ The commands accepted in walsender mode are: - Byte8 + Int64 - The current end of WAL on the server, given in - XLogRecPtr format. + The current end of WAL on the server. - Byte8 + Int64 - The server's system clock at the time of transmission, - given in TimestampTz format. + The server's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. + + + + + + Byte1 + + + + 1 means that the client should reply to this message as soon as + possible, to avoid a timeout disconnect. 0 otherwise. @@ -1511,6 +1500,12 @@ The commands accepted in walsender mode are: + + The receiving process can send replies back to the sender at any time, + using one of the following message formats (also in the payload of a + CopyData message): + + @@ -1532,45 +1527,56 @@ The commands accepted in walsender mode are: - Byte8 + Int64 The location of the last WAL byte + 1 received and written to disk - in the standby, in XLogRecPtr format. + in the standby. 
- Byte8 + Int64 The location of the last WAL byte + 1 flushed to disk in - the standby, in XLogRecPtr format. + the standby. + + + + + + Int64 + + + + The location of the last WAL byte + 1 applied in the standby. - Byte8 + Int64 - The location of the last WAL byte + 1 applied in the standby, in - XLogRecPtr format. + The client's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. - Byte8 + Byte1 - The server's system clock at the time of transmission, - given in TimestampTz format. + If 1, the client requests the server to reply to this message + immediately. This can be used to ping the server, to test if + the connection is still healthy. @@ -1602,28 +1608,29 @@ The commands accepted in walsender mode are: - Byte8 + Int64 - The server's system clock at the time of transmission, - given in TimestampTz format. + The client's system clock at the time of transmission, as + microseconds since midnight on 2000-01-01. - Byte4 + Int32 - The standby's current xmin. + The standby's current xmin. This may be 0, if the standby does not + support feedback, or is not yet in Hot Standby state. - Byte4 + Int32 diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index b2a6dd797c..6d34c31988 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -34,6 +34,7 @@ COPY { table_name [ ( format_name OIDS [ boolean ] + FREEZE [ boolean ] DELIMITER 'delimiter_character' NULL 'null_string' HEADER [ boolean ] @@ -181,6 +182,28 @@ COPY { table_name [ ( + + FREEZE + + + Specifies copying the data with rows already frozen, just as they + would be after running the VACUUM FREEZE command. + This is intended as a performance option for initial data loading. + Rows will be frozen only if the table being loaded has been created + in the current subtransaction, there are no cursors open and there + are no older snapshots held by this transaction. If those conditions + are not met the command will continue without error though will not + freeze rows. + + + Note that all sessions will immediately be able to see the data + once it has been successfully loaded. This violates the normal rules + of MVCC visibility and by specifying this option the user acknowledges + explicitly that this is understood. + + + + DELIMITER diff --git a/doc/src/sgml/ref/create_function.sgml b/doc/src/sgml/ref/create_function.sgml index 4336e4b218..fee6f53ba2 100644 --- a/doc/src/sgml/ref/create_function.sgml +++ b/doc/src/sgml/ref/create_function.sgml @@ -684,7 +684,7 @@ SELECT * FROM dup(42); temporary-table schema, which is searched first by default, and is normally writable by anyone. A secure arrangement can be had by forcing the temporary schema to be searched last. To do this, - write pg_temp as the last entry in search_path. + write pg_temppg_tempsecuring functions as the last entry in search_path. This function illustrates safe usage: diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 17b433a47e..d800701ff4 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -466,6 +466,18 @@ Indexes: they can be useful. + + + Hash index operations are not presently WAL-logged, + so hash indexes might need to be rebuilt with REINDEX + after a database crash if there were unwritten changes. + Also, changes to hash indexes are not replicated over streaming or + file-based replication after the initial base backup, so they + give wrong answers to queries that subsequently use them. 
+ For these reasons, hash index use is presently discouraged. + + + Currently, only the B-tree, GiST and GIN index methods support multicolumn indexes. Up to 32 fields can be specified by default. diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 445ca40695..8872920446 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1453,6 +1453,7 @@ CREATE TABLE employees OF employee_type ( + diff --git a/doc/src/sgml/ref/create_view.sgml b/doc/src/sgml/ref/create_view.sgml index 838bf486a3..9e3bc2954f 100644 --- a/doc/src/sgml/ref/create_view.sgml +++ b/doc/src/sgml/ref/create_view.sgml @@ -130,9 +130,12 @@ CREATE [ OR REPLACE ] [ TEMP | TEMPORARY ] VIEW n Currently, views are read only: the system will not allow an insert, update, or delete on a view. You can get the effect of an updatable - view by creating rules that rewrite inserts, etc. on the view into + view by creating INSTEAD triggers on the view, which + must convert attempted inserts, etc. on the view into appropriate actions on other tables. For more information see - . + . Another possibility is to create + rules (see ), but in practice triggers + are easier to understand and use correctly. diff --git a/doc/src/sgml/ref/drop_index.sgml b/doc/src/sgml/ref/drop_index.sgml index 343f7aca00..98fd9c966c 100644 --- a/doc/src/sgml/ref/drop_index.sgml +++ b/doc/src/sgml/ref/drop_index.sgml @@ -40,34 +40,33 @@ DROP INDEX [ CONCURRENTLY ] [ IF EXISTS ] name - IF EXISTS + CONCURRENTLY - Do not throw an error if the index does not exist. A notice is issued - in this case. + Drop the index without locking out concurrent selects, inserts, updates, + and deletes on the index's table. A normal DROP INDEX + acquires exclusive lock on the table, blocking other accesses until the + index drop can be completed. With this option, the command instead + waits until conflicting transactions have completed. + + + There are several caveats to be aware of when using this option. + Only one index name can be specified, and the CASCADE option + is not supported. (Thus, an index that supports a UNIQUE or + PRIMARY KEY constraint cannot be dropped this way.) + Also, regular DROP INDEX commands can be + performed within a transaction block, but + DROP INDEX CONCURRENTLY cannot. - CONCURRENTLY + IF EXISTS - When this option is used, PostgreSQL will drop the - index without taking any locks that prevent concurrent selects, inserts, - updates, or deletes on the table; whereas a standard index drop - waits for a lock that locks out everything on the table until it's done. - Concurrent drop index is a two stage process. First, we mark the index - both invalid and not ready then commit the change. Next we wait until - there are no users locking the table who can see the index. - - - There are several caveats to be aware of when using this option. - Only one index name can be specified if the CONCURRENTLY - parameter is specified. Regular DROP INDEX command can be - performed within a transaction block, but - DROP INDEX CONCURRENTLY cannot. - The CASCADE option is not supported when dropping an index concurrently. + Do not throw an error if the index does not exist. A notice is issued + in this case. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 08ee37e7d8..a1e46eb4c6 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -244,6 +244,17 @@ PostgreSQL documentation + + + + + + Safely write all database files to disk and exit. 
This does not + perform any of the normal initdb operations. + + + + diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index a951d6b0f0..0bc3ca27b1 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -377,10 +377,10 @@ PostgreSQL documentation Specifies the number of seconds between status packets sent back to the - server. This is required when streaming the transaction log (using - --xlog=stream) if replication timeout is configured - on the server, and allows for easier monitoring. A value of zero disables - the status updates completely. The default value is 10 seconds. + server. This allows for easier monitoring of the progress from server. + A value of zero disables the periodic status updates completely, + although an update will still be sent when requested by the server, to + avoid timeout disconnect. The default value is 10 seconds. diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 450383083d..d936cf185d 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -143,7 +143,8 @@ PostgreSQL documentation Output commands to clean (drop) database objects prior to outputting the commands for creating them. - (Restore might generate some harmless errors.) + (Restore might generate some harmless error messages, if any objects + were not present in the destination database.) @@ -161,8 +162,10 @@ PostgreSQL documentation Begin the output with a command to create the database itself and reconnect to the created database. (With a - script of this form, it doesn't matter which database you connect - to before running the script.) + script of this form, it doesn't matter which database in the + destination installation you connect to before running the script.) + If is also specified, the script drops and + recreates the target database before reconnecting to it. @@ -427,7 +430,7 @@ PostgreSQL documentation Specify the superuser user name to use when disabling triggers. - This is only relevant if @@ -602,7 +605,7 @@ PostgreSQL documentation - This option is only relevant when creating a data-only dump. + This option is relevant only when creating a data-only dump. It instructs pg_dump to include commands to temporarily disable triggers on the target tables while the data is reloaded. Use this if you have referential @@ -736,11 +739,11 @@ PostgreSQL documentation sections. The default is to dump all sections. - The data section contains actual table data as well as large-object - definitions. - Post-data items consist of definitions of indexes, triggers, rules + The data section contains actual table data, large-object + contents, and sequence values. + Post-data items include definitions of indexes, triggers, rules, and constraints other than validated check constraints. - Pre-data items consist of all other data definition items. + Pre-data items include all other data definition items. diff --git a/doc/src/sgml/ref/pg_dumpall.sgml b/doc/src/sgml/ref/pg_dumpall.sgml index 7c49c0364f..253ee01c0e 100644 --- a/doc/src/sgml/ref/pg_dumpall.sgml +++ b/doc/src/sgml/ref/pg_dumpall.sgml @@ -190,7 +190,7 @@ PostgreSQL documentation Specify the superuser user name to use when disabling triggers. - This is only relevant if @@ -283,7 +283,7 @@ PostgreSQL documentation - This option is only relevant when creating a data-only dump. + This option is relevant only when creating a data-only dump. 
It instructs pg_dumpall to include commands to temporarily disable triggers on the target tables while the data is reloaded. Use this if you have referential diff --git a/doc/src/sgml/ref/pg_receivexlog.sgml b/doc/src/sgml/ref/pg_receivexlog.sgml index 7f62fd9e61..d06dd1f171 100644 --- a/doc/src/sgml/ref/pg_receivexlog.sgml +++ b/doc/src/sgml/ref/pg_receivexlog.sgml @@ -155,9 +155,10 @@ PostgreSQL documentation Specifies the number of seconds between status packets sent back to the - server. This is required if replication timeout is configured on the - server, and allows for easier monitoring. A value of zero disables the - status updates completely. The default value is 10 seconds. + server. This allows for easier monitoring of the progress from server. + A value of zero disables the periodic status updates completely, + although an update will still be sent when requested by the server, to + avoid timeout disconnect. The default value is 10 seconds. diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml index b276da6afe..f4668e73f6 100644 --- a/doc/src/sgml/ref/pg_restore.sgml +++ b/doc/src/sgml/ref/pg_restore.sgml @@ -109,6 +109,8 @@ Clean (drop) database objects before recreating them. + (This might generate some harmless error messages, if any objects + were not present in the destination database.) @@ -118,11 +120,16 @@ - Create the database before restoring into it. (When this - option is used, the database named with is - used only to issue the initial CREATE DATABASE - command. All data is restored into the database name that - appears in the archive.) + Create the database before restoring into it. + If is also specified, drop and + recreate the target database before connecting to it. + + + + When this option is used, the database named with + is used only to issue the initial DROP DATABASE and + CREATE DATABASE commands. All data is restored into the + database name that appears in the archive. @@ -383,7 +390,7 @@ Specify the superuser user name to use when disabling triggers. - This is only relevant if @@ -458,7 +465,7 @@ - This option is only relevant when performing a data-only restore. + This option is relevant only when performing a data-only restore. It instructs pg_restore to execute commands to temporarily disable triggers on the target tables while the data is reloaded. Use this if you have referential @@ -468,9 +475,9 @@ Presently, the commands emitted for - diff --git a/doc/src/sgml/release-8.3.sgml b/doc/src/sgml/release-8.3.sgml index d3b163a4d3..082dc349bc 100644 --- a/doc/src/sgml/release-8.3.sgml +++ b/doc/src/sgml/release-8.3.sgml @@ -1,6 +1,302 @@ + + Release 8.3.22 + + + Release Date + 2012-12-06 + + + + This release contains a variety of fixes from 8.3.21. + For information about new features in the 8.3 major release, see + . + + + + The PostgreSQL community will stop releasing updates + for the 8.3.X release series in February 2013. + Users are encouraged to update to a newer release branch soon. + + + + Migration to Version 8.3.22 + + + A dump/restore is not required for those running 8.3.X. + + + + However, if you are upgrading from a version earlier than 8.3.17, + see the release notes for 8.3.17. + + + + + + Changes + + + + + + Fix multiple bugs associated with CREATE INDEX + CONCURRENTLY (Andres Freund, Tom Lane) + + + + Fix CREATE INDEX CONCURRENTLY to use + in-place updates when changing the state of an index's + pg_index row. 
This prevents race conditions that could + cause concurrent sessions to miss updating the target index, thus + resulting in corrupt concurrently-created indexes. + + + + Also, fix various other operations to ensure that they ignore + invalid indexes resulting from a failed CREATE INDEX + CONCURRENTLY command. The most important of these is + VACUUM, because an auto-vacuum could easily be launched + on the table before corrective action can be taken to fix or remove + the invalid index. + + + + + + Avoid corruption of internal hash tables when out of memory + (Hitoshi Harada) + + + + + + Fix planning of non-strict equivalence clauses above outer joins + (Tom Lane) + + + + The planner could derive incorrect constraints from a clause equating + a non-strict construct to something else, for example + WHERE COALESCE(foo, 0) = 0 + when foo is coming from the nullable side of an outer join. + + + + + + Improve planner's ability to prove exclusion constraints from + equivalence classes (Tom Lane) + + + + + + Fix partial-row matching in hashed subplans to handle cross-type cases + correctly (Tom Lane) + + + + This affects multicolumn NOT IN subplans, such as + WHERE (a, b) NOT IN (SELECT x, y FROM ...) + when for instance b and y are int4 + and int8 respectively. This mistake led to wrong answers + or crashes depending on the specific datatypes involved. + + + + + + Acquire buffer lock when re-fetching the old tuple for an + AFTER ROW UPDATE/DELETE trigger (Andres Freund) + + + + In very unusual circumstances, this oversight could result in passing + incorrect data to the precheck logic for a foreign-key enforcement + trigger. That could result in a crash, or in an incorrect decision + about whether to fire the trigger. + + + + + + Fix REASSIGN OWNED to handle grants on tablespaces + (Álvaro Herrera) + + + + + + Ignore incorrect pg_attribute entries for system + columns for views (Tom Lane) + + + + Views do not have any system columns. However, we forgot to + remove such entries when converting a table to a view. That's fixed + properly for 9.3 and later, but in previous branches we need to defend + against existing mis-converted views. + + + + + + Fix rule printing to dump INSERT INTO table + DEFAULT VALUES correctly (Tom Lane) + + + + + + Guard against stack overflow when there are too many + UNION/INTERSECT/EXCEPT clauses + in a query (Tom Lane) + + + + + + Prevent platform-dependent failures when dividing the minimum possible + integer value by -1 (Xi Wang, Tom Lane) + + + + + + Fix possible access past end of string in date parsing + (Hitoshi Harada) + + + + + + Produce an understandable error message if the length of the path name + for a Unix-domain socket exceeds the platform-specific limit + (Tom Lane, Andrew Dunstan) + + + + Formerly, this would result in something quite unhelpful, such as + Non-recoverable failure in name resolution. + + + + + + Fix memory leaks when sending composite column values to the client + (Tom Lane) + + + + + + Make pg_ctl more robust about reading the + postmaster.pid file (Heikki Linnakangas) + + + + Fix race conditions and possible file descriptor leakage. + + + + + + Fix possible crash in psql if incorrectly-encoded data + is presented and the client_encoding setting is a + client-only encoding, such as SJIS (Jiang Guiqing) + + + + + + Fix bugs in the restore.sql script emitted by + pg_dump in tar output format (Tom Lane) + + + + The script would fail outright on tables whose names include + upper-case characters. 
Also, make the script capable of restoring + data in + + + + + Fix pg_restore to accept POSIX-conformant + tar files (Brian Weaver, Tom Lane) + + + + The original coding of pg_dump's tar + output mode produced files that are not fully conformant with the + POSIX standard. This has been corrected for version 9.3. This + patch updates previous branches so that they will accept both the + incorrect and the corrected formats, in hopes of avoiding + compatibility problems when 9.3 comes out. + + + + + + Fix pg_resetxlog to locate postmaster.pid + correctly when given a relative path to the data directory (Tom Lane) + + + + This mistake could lead to pg_resetxlog not noticing + that there is an active postmaster using the data directory. + + + + + + Fix libpq's lo_import() and + lo_export() functions to report file I/O errors properly + (Tom Lane) + + + + + + Fix ecpg's processing of nested structure pointer + variables (Muhammad Usama) + + + + + + Make contrib/pageinspect's btree page inspection + functions take buffer locks while examining pages (Tom Lane) + + + + + + Fix pgxs support for building loadable modules on AIX + (Tom Lane) + + + + Building modules outside the original source tree didn't work on AIX. + + + + + + Update time zone data files to tzdata release 2012j + for DST law changes in Cuba, Israel, Jordan, Libya, Palestine, Western + Samoa, and portions of Brazil. + + + + + + + + Release 8.3.21 diff --git a/doc/src/sgml/release-8.4.sgml b/doc/src/sgml/release-8.4.sgml index 54f5c131af..f6a938c2ce 100644 --- a/doc/src/sgml/release-8.4.sgml +++ b/doc/src/sgml/release-8.4.sgml @@ -1,6 +1,308 @@ + + Release 8.4.15 + + + Release Date + 2012-12-06 + + + + This release contains a variety of fixes from 8.4.14. + For information about new features in the 8.4 major release, see + . + + + + Migration to Version 8.4.15 + + + A dump/restore is not required for those running 8.4.X. + + + + However, if you are upgrading from a version earlier than 8.4.10, + see the release notes for 8.4.10. + + + + + + Changes + + + + + + Fix multiple bugs associated with CREATE INDEX + CONCURRENTLY (Andres Freund, Tom Lane) + + + + Fix CREATE INDEX CONCURRENTLY to use + in-place updates when changing the state of an index's + pg_index row. This prevents race conditions that could + cause concurrent sessions to miss updating the target index, thus + resulting in corrupt concurrently-created indexes. + + + + Also, fix various other operations to ensure that they ignore + invalid indexes resulting from a failed CREATE INDEX + CONCURRENTLY command. The most important of these is + VACUUM, because an auto-vacuum could easily be launched + on the table before corrective action can be taken to fix or remove + the invalid index. + + + + + + Avoid corruption of internal hash tables when out of memory + (Hitoshi Harada) + + + + + + Fix planning of non-strict equivalence clauses above outer joins + (Tom Lane) + + + + The planner could derive incorrect constraints from a clause equating + a non-strict construct to something else, for example + WHERE COALESCE(foo, 0) = 0 + when foo is coming from the nullable side of an outer join. + + + + + + Improve planner's ability to prove exclusion constraints from + equivalence classes (Tom Lane) + + + + + + Fix partial-row matching in hashed subplans to handle cross-type cases + correctly (Tom Lane) + + + + This affects multicolumn NOT IN subplans, such as + WHERE (a, b) NOT IN (SELECT x, y FROM ...) + when for instance b and y are int4 + and int8 respectively. 
This mistake led to wrong answers + or crashes depending on the specific datatypes involved. + + + + + + Acquire buffer lock when re-fetching the old tuple for an + AFTER ROW UPDATE/DELETE trigger (Andres Freund) + + + + In very unusual circumstances, this oversight could result in passing + incorrect data to the precheck logic for a foreign-key enforcement + trigger. That could result in a crash, or in an incorrect decision + about whether to fire the trigger. + + + + + + Fix ALTER COLUMN TYPE to handle inherited check + constraints properly (Pavan Deolasee) + + + + This worked correctly in pre-8.4 releases, and now works correctly + in 8.4 and later. + + + + + + Fix REASSIGN OWNED to handle grants on tablespaces + (Álvaro Herrera) + + + + + + Ignore incorrect pg_attribute entries for system + columns for views (Tom Lane) + + + + Views do not have any system columns. However, we forgot to + remove such entries when converting a table to a view. That's fixed + properly for 9.3 and later, but in previous branches we need to defend + against existing mis-converted views. + + + + + + Fix rule printing to dump INSERT INTO table + DEFAULT VALUES correctly (Tom Lane) + + + + + + Guard against stack overflow when there are too many + UNION/INTERSECT/EXCEPT clauses + in a query (Tom Lane) + + + + + + Prevent platform-dependent failures when dividing the minimum possible + integer value by -1 (Xi Wang, Tom Lane) + + + + + + Fix possible access past end of string in date parsing + (Hitoshi Harada) + + + + + + Produce an understandable error message if the length of the path name + for a Unix-domain socket exceeds the platform-specific limit + (Tom Lane, Andrew Dunstan) + + + + Formerly, this would result in something quite unhelpful, such as + Non-recoverable failure in name resolution. + + + + + + Fix memory leaks when sending composite column values to the client + (Tom Lane) + + + + + + Make pg_ctl more robust about reading the + postmaster.pid file (Heikki Linnakangas) + + + + Fix race conditions and possible file descriptor leakage. + + + + + + Fix possible crash in psql if incorrectly-encoded data + is presented and the client_encoding setting is a + client-only encoding, such as SJIS (Jiang Guiqing) + + + + + + Fix bugs in the restore.sql script emitted by + pg_dump in tar output format (Tom Lane) + + + + The script would fail outright on tables whose names include + upper-case characters. Also, make the script capable of restoring + data in + + + + + Fix pg_restore to accept POSIX-conformant + tar files (Brian Weaver, Tom Lane) + + + + The original coding of pg_dump's tar + output mode produced files that are not fully conformant with the + POSIX standard. This has been corrected for version 9.3. This + patch updates previous branches so that they will accept both the + incorrect and the corrected formats, in hopes of avoiding + compatibility problems when 9.3 comes out. + + + + + + Fix pg_resetxlog to locate postmaster.pid + correctly when given a relative path to the data directory (Tom Lane) + + + + This mistake could lead to pg_resetxlog not noticing + that there is an active postmaster using the data directory. 
+ + + + + + Fix libpq's lo_import() and + lo_export() functions to report file I/O errors properly + (Tom Lane) + + + + + + Fix ecpg's processing of nested structure pointer + variables (Muhammad Usama) + + + + + + Make contrib/pageinspect's btree page inspection + functions take buffer locks while examining pages (Tom Lane) + + + + + + Fix pgxs support for building loadable modules on AIX + (Tom Lane) + + + + Building modules outside the original source tree didn't work on AIX. + + + + + + Update time zone data files to tzdata release 2012j + for DST law changes in Cuba, Israel, Jordan, Libya, Palestine, Western + Samoa, and portions of Brazil. + + + + + + + + Release 8.4.14 diff --git a/doc/src/sgml/release-9.0.sgml b/doc/src/sgml/release-9.0.sgml index af99af82be..495738cd3c 100644 --- a/doc/src/sgml/release-9.0.sgml +++ b/doc/src/sgml/release-9.0.sgml @@ -1,6 +1,384 @@ + + Release 9.0.11 + + + Release Date + 2012-12-06 + + + + This release contains a variety of fixes from 9.0.10. + For information about new features in the 9.0 major release, see + . + + + + Migration to Version 9.0.11 + + + A dump/restore is not required for those running 9.0.X. + + + + However, if you are upgrading from a version earlier than 9.0.6, + see the release notes for 9.0.6. + + + + + + Changes + + + + + + Fix multiple bugs associated with CREATE INDEX + CONCURRENTLY (Andres Freund, Tom Lane) + + + + Fix CREATE INDEX CONCURRENTLY to use + in-place updates when changing the state of an index's + pg_index row. This prevents race conditions that could + cause concurrent sessions to miss updating the target index, thus + resulting in corrupt concurrently-created indexes. + + + + Also, fix various other operations to ensure that they ignore + invalid indexes resulting from a failed CREATE INDEX + CONCURRENTLY command. The most important of these is + VACUUM, because an auto-vacuum could easily be launched + on the table before corrective action can be taken to fix or remove + the invalid index. + + + + + + Fix buffer locking during WAL replay (Tom Lane) + + + + The WAL replay code was insufficiently careful about locking buffers + when replaying WAL records that affect more than one page. This could + result in hot standby queries transiently seeing inconsistent states, + resulting in wrong answers or unexpected failures. + + + + + + Fix an error in WAL generation logic for GIN indexes (Tom Lane) + + + + This could result in index corruption, if a torn-page failure occurred. + + + + + + Properly remove startup process's virtual XID lock when promoting a + hot standby server to normal running (Simon Riggs) + + + + This oversight could prevent subsequent execution of certain + operations such as CREATE INDEX CONCURRENTLY. + + + + + + Avoid bogus out-of-sequence timeline ID errors in standby + mode (Heikki Linnakangas) + + + + + + Prevent the postmaster from launching new child processes after it's + received a shutdown signal (Tom Lane) + + + + This mistake could result in shutdown taking longer than it should, or + even never completing at all without additional user action. + + + + + + Avoid corruption of internal hash tables when out of memory + (Hitoshi Harada) + + + + + + Fix planning of non-strict equivalence clauses above outer joins + (Tom Lane) + + + + The planner could derive incorrect constraints from a clause equating + a non-strict construct to something else, for example + WHERE COALESCE(foo, 0) = 0 + when foo is coming from the nullable side of an outer join. 
+ + + + + + Improve planner's ability to prove exclusion constraints from + equivalence classes (Tom Lane) + + + + + + Fix partial-row matching in hashed subplans to handle cross-type cases + correctly (Tom Lane) + + + + This affects multicolumn NOT IN subplans, such as + WHERE (a, b) NOT IN (SELECT x, y FROM ...) + when for instance b and y are int4 + and int8 respectively. This mistake led to wrong answers + or crashes depending on the specific datatypes involved. + + + + + + Acquire buffer lock when re-fetching the old tuple for an + AFTER ROW UPDATE/DELETE trigger (Andres Freund) + + + + In very unusual circumstances, this oversight could result in passing + incorrect data to the precheck logic for a foreign-key enforcement + trigger. That could result in a crash, or in an incorrect decision + about whether to fire the trigger. + + + + + + Fix ALTER COLUMN TYPE to handle inherited check + constraints properly (Pavan Deolasee) + + + + This worked correctly in pre-8.4 releases, and now works correctly + in 8.4 and later. + + + + + + Fix REASSIGN OWNED to handle grants on tablespaces + (Álvaro Herrera) + + + + + + Ignore incorrect pg_attribute entries for system + columns for views (Tom Lane) + + + + Views do not have any system columns. However, we forgot to + remove such entries when converting a table to a view. That's fixed + properly for 9.3 and later, but in previous branches we need to defend + against existing mis-converted views. + + + + + + Fix rule printing to dump INSERT INTO table + DEFAULT VALUES correctly (Tom Lane) + + + + + + Guard against stack overflow when there are too many + UNION/INTERSECT/EXCEPT clauses + in a query (Tom Lane) + + + + + + Prevent platform-dependent failures when dividing the minimum possible + integer value by -1 (Xi Wang, Tom Lane) + + + + + + Fix possible access past end of string in date parsing + (Hitoshi Harada) + + + + + + Fix failure to advance XID epoch if XID wraparound happens during a + checkpoint and wal_level is hot_standby + (Tom Lane, Andres Freund) + + + + While this mistake had no particular impact on + PostgreSQL itself, it was bad for + applications that rely on txid_current() and related + functions: the TXID value would appear to go backwards. + + + + + + Produce an understandable error message if the length of the path name + for a Unix-domain socket exceeds the platform-specific limit + (Tom Lane, Andrew Dunstan) + + + + Formerly, this would result in something quite unhelpful, such as + Non-recoverable failure in name resolution. + + + + + + Fix memory leaks when sending composite column values to the client + (Tom Lane) + + + + + + Make pg_ctl more robust about reading the + postmaster.pid file (Heikki Linnakangas) + + + + Fix race conditions and possible file descriptor leakage. + + + + + + Fix possible crash in psql if incorrectly-encoded data + is presented and the client_encoding setting is a + client-only encoding, such as SJIS (Jiang Guiqing) + + + + + + Fix bugs in the restore.sql script emitted by + pg_dump in tar output format (Tom Lane) + + + + The script would fail outright on tables whose names include + upper-case characters. Also, make the script capable of restoring + data in + + + + + Fix pg_restore to accept POSIX-conformant + tar files (Brian Weaver, Tom Lane) + + + + The original coding of pg_dump's tar + output mode produced files that are not fully conformant with the + POSIX standard. This has been corrected for version 9.3. 
This + patch updates previous branches so that they will accept both the + incorrect and the corrected formats, in hopes of avoiding + compatibility problems when 9.3 comes out. + + + + + + Fix pg_resetxlog to locate postmaster.pid + correctly when given a relative path to the data directory (Tom Lane) + + + + This mistake could lead to pg_resetxlog not noticing + that there is an active postmaster using the data directory. + + + + + + Fix libpq's lo_import() and + lo_export() functions to report file I/O errors properly + (Tom Lane) + + + + + + Fix ecpg's processing of nested structure pointer + variables (Muhammad Usama) + + + + + + Fix ecpg's ecpg_get_data function to + handle arrays properly (Michael Meskes) + + + + + + Make contrib/pageinspect's btree page inspection + functions take buffer locks while examining pages (Tom Lane) + + + + + + Fix pgxs support for building loadable modules on AIX + (Tom Lane) + + + + Building modules outside the original source tree didn't work on AIX. + + + + + + Update time zone data files to tzdata release 2012j + for DST law changes in Cuba, Israel, Jordan, Libya, Palestine, Western + Samoa, and portions of Brazil. + + + + + + + + Release 9.0.10 diff --git a/doc/src/sgml/release-9.1.sgml b/doc/src/sgml/release-9.1.sgml index 5fbdd7a195..1143fdfdcd 100644 --- a/doc/src/sgml/release-9.1.sgml +++ b/doc/src/sgml/release-9.1.sgml @@ -1,6 +1,469 @@ + + Release 9.1.7 + + + Release Date + 2012-12-06 + + + + This release contains a variety of fixes from 9.1.6. + For information about new features in the 9.1 major release, see + . + + + + Migration to Version 9.1.7 + + + A dump/restore is not required for those running 9.1.X. + + + + However, if you are upgrading from a version earlier than 9.1.6, + see the release notes for 9.1.6. + + + + + + Changes + + + + + + Fix multiple bugs associated with CREATE INDEX + CONCURRENTLY (Andres Freund, Tom Lane) + + + + Fix CREATE INDEX CONCURRENTLY to use + in-place updates when changing the state of an index's + pg_index row. This prevents race conditions that could + cause concurrent sessions to miss updating the target index, thus + resulting in corrupt concurrently-created indexes. + + + + Also, fix various other operations to ensure that they ignore + invalid indexes resulting from a failed CREATE INDEX + CONCURRENTLY command. The most important of these is + VACUUM, because an auto-vacuum could easily be launched + on the table before corrective action can be taken to fix or remove + the invalid index. + + + + + + Fix buffer locking during WAL replay (Tom Lane) + + + + The WAL replay code was insufficiently careful about locking buffers + when replaying WAL records that affect more than one page. This could + result in hot standby queries transiently seeing inconsistent states, + resulting in wrong answers or unexpected failures. + + + + + + Fix an error in WAL generation logic for GIN indexes (Tom Lane) + + + + This could result in index corruption, if a torn-page failure occurred. + + + + + + Properly remove startup process's virtual XID lock when promoting a + hot standby server to normal running (Simon Riggs) + + + + This oversight could prevent subsequent execution of certain + operations such as CREATE INDEX CONCURRENTLY. 
+ + + + + + Avoid bogus out-of-sequence timeline ID errors in standby + mode (Heikki Linnakangas) + + + + + + Prevent the postmaster from launching new child processes after it's + received a shutdown signal (Tom Lane) + + + + This mistake could result in shutdown taking longer than it should, or + even never completing at all without additional user action. + + + + + + Avoid corruption of internal hash tables when out of memory + (Hitoshi Harada) + + + + + + Prevent file descriptors for dropped tables from being held open past + transaction end (Tom Lane) + + + + This should reduce problems with long-since-dropped tables continuing + to occupy disk space. + + + + + + Prevent database-wide crash and restart when a new child process is + unable to create a pipe for its latch (Tom Lane) + + + + Although the new process must fail, there is no good reason to force a + database-wide restart, so avoid that. This improves robustness when + the kernel is nearly out of file descriptors. + + + + + + Fix planning of non-strict equivalence clauses above outer joins + (Tom Lane) + + + + The planner could derive incorrect constraints from a clause equating + a non-strict construct to something else, for example + WHERE COALESCE(foo, 0) = 0 + when foo is coming from the nullable side of an outer join. + + + + + + Fix SELECT DISTINCT with index-optimized + MIN/MAX on an inheritance tree (Tom Lane) + + + + The planner would fail with failed to re-find MinMaxAggInfo + record given this combination of factors. + + + + + + Improve planner's ability to prove exclusion constraints from + equivalence classes (Tom Lane) + + + + + + Fix partial-row matching in hashed subplans to handle cross-type cases + correctly (Tom Lane) + + + + This affects multicolumn NOT IN subplans, such as + WHERE (a, b) NOT IN (SELECT x, y FROM ...) + when for instance b and y are int4 + and int8 respectively. This mistake led to wrong answers + or crashes depending on the specific datatypes involved. + + + + + + Acquire buffer lock when re-fetching the old tuple for an + AFTER ROW UPDATE/DELETE trigger (Andres Freund) + + + + In very unusual circumstances, this oversight could result in passing + incorrect data to a trigger WHEN condition, or to the + precheck logic for a foreign-key enforcement trigger. That could + result in a crash, or in an incorrect decision about whether to + fire the trigger. + + + + + + Fix ALTER COLUMN TYPE to handle inherited check + constraints properly (Pavan Deolasee) + + + + This worked correctly in pre-8.4 releases, and now works correctly + in 8.4 and later. + + + + + + Fix ALTER EXTENSION SET SCHEMA's failure to move some + subsidiary objects into the new schema (Álvaro Herrera, Dimitri + Fontaine) + + + + + + Fix REASSIGN OWNED to handle grants on tablespaces + (Álvaro Herrera) + + + + + + Ignore incorrect pg_attribute entries for system + columns for views (Tom Lane) + + + + Views do not have any system columns. However, we forgot to + remove such entries when converting a table to a view. That's fixed + properly for 9.3 and later, but in previous branches we need to defend + against existing mis-converted views. 
+ + + + + + Fix rule printing to dump INSERT INTO table + DEFAULT VALUES correctly (Tom Lane) + + + + + + Guard against stack overflow when there are too many + UNION/INTERSECT/EXCEPT clauses + in a query (Tom Lane) + + + + + + Prevent platform-dependent failures when dividing the minimum possible + integer value by -1 (Xi Wang, Tom Lane) + + + + + + Fix possible access past end of string in date parsing + (Hitoshi Harada) + + + + + + Fix failure to advance XID epoch if XID wraparound happens during a + checkpoint and wal_level is hot_standby + (Tom Lane, Andres Freund) + + + + While this mistake had no particular impact on + PostgreSQL itself, it was bad for + applications that rely on txid_current() and related + functions: the TXID value would appear to go backwards. + + + + + + Fix display of + pg_stat_replication.sync_state at a + page boundary (Kyotaro Horiguchi) + + + + + + Produce an understandable error message if the length of the path name + for a Unix-domain socket exceeds the platform-specific limit + (Tom Lane, Andrew Dunstan) + + + + Formerly, this would result in something quite unhelpful, such as + Non-recoverable failure in name resolution. + + + + + + Fix memory leaks when sending composite column values to the client + (Tom Lane) + + + + + + Make pg_ctl more robust about reading the + postmaster.pid file (Heikki Linnakangas) + + + + Fix race conditions and possible file descriptor leakage. + + + + + + Fix possible crash in psql if incorrectly-encoded data + is presented and the client_encoding setting is a + client-only encoding, such as SJIS (Jiang Guiqing) + + + + + + Make pg_dump dump SEQUENCE SET items in + the data not pre-data section of the archive (Tom Lane) + + + + This change fixes dumping of sequences that are marked as extension + configuration tables. + + + + + + Fix bugs in the restore.sql script emitted by + pg_dump in tar output format (Tom Lane) + + + + The script would fail outright on tables whose names include + upper-case characters. Also, make the script capable of restoring + data in + + + + + Fix pg_restore to accept POSIX-conformant + tar files (Brian Weaver, Tom Lane) + + + + The original coding of pg_dump's tar + output mode produced files that are not fully conformant with the + POSIX standard. This has been corrected for version 9.3. This + patch updates previous branches so that they will accept both the + incorrect and the corrected formats, in hopes of avoiding + compatibility problems when 9.3 comes out. + + + + + + Fix tar files emitted by pg_basebackup to + be POSIX conformant (Brian Weaver, Tom Lane) + + + + + + Fix pg_resetxlog to locate postmaster.pid + correctly when given a relative path to the data directory (Tom Lane) + + + + This mistake could lead to pg_resetxlog not noticing + that there is an active postmaster using the data directory. + + + + + + Fix libpq's lo_import() and + lo_export() functions to report file I/O errors properly + (Tom Lane) + + + + + + Fix ecpg's processing of nested structure pointer + variables (Muhammad Usama) + + + + + + Fix ecpg's ecpg_get_data function to + handle arrays properly (Michael Meskes) + + + + + + Make contrib/pageinspect's btree page inspection + functions take buffer locks while examining pages (Tom Lane) + + + + + + Ensure that make install for an extension creates the + extension installation directory (Cédric Villemain) + + + + Previously, this step was missed if MODULEDIR was set in + the extension's Makefile. 
+ + + + + + Fix pgxs support for building loadable modules on AIX + (Tom Lane) + + + + Building modules outside the original source tree didn't work on AIX. + + + + + + Update time zone data files to tzdata release 2012j + for DST law changes in Cuba, Israel, Jordan, Libya, Palestine, Western + Samoa, and portions of Brazil. + + + + + + + + Release 9.1.6 @@ -146,7 +609,7 @@ Disallow extensions from containing the schema they are assigned to - (Thom Brown) + (Thom Brown) diff --git a/doc/src/sgml/release-9.2.sgml b/doc/src/sgml/release-9.2.sgml index 840e7a7a0d..5f4f4baff3 100644 --- a/doc/src/sgml/release-9.2.sgml +++ b/doc/src/sgml/release-9.2.sgml @@ -1,6 +1,730 @@ + + Release 9.2.2 + + + Release Date + 2012-12-06 + + + + This release contains a variety of fixes from 9.2.1. + For information about new features in the 9.2 major release, see + . + + + + Migration to Version 9.2.2 + + + A dump/restore is not required for those running 9.2.X. + + + + However, you may need to perform REINDEX operations to + correct problems in concurrently-built indexes, as described in the first + changelog item below. + + + + Also, if you are upgrading from version 9.2.0, + see the release notes for 9.2.1. + + + + + + Changes + + + + + + Fix multiple bugs associated with CREATE/DROP INDEX + CONCURRENTLY (Andres Freund, Tom Lane, Simon Riggs, Pavan Deolasee) + + + + An error introduced while adding DROP INDEX CONCURRENTLY + allowed incorrect indexing decisions to be made during the initial + phase of CREATE INDEX CONCURRENTLY; so that indexes built + by that command could be corrupt. It is recommended that indexes + built in 9.2.X with CREATE INDEX CONCURRENTLY be rebuilt + after applying this update. + + + + In addition, fix CREATE/DROP INDEX CONCURRENTLY to use + in-place updates when changing the state of an index's + pg_index row. This prevents race conditions that could + cause concurrent sessions to miss updating the target index, thus + again resulting in corrupt concurrently-created indexes. + + + + Also, fix various other operations to ensure that they ignore + invalid indexes resulting from a failed CREATE INDEX + CONCURRENTLY command. The most important of these is + VACUUM, because an auto-vacuum could easily be launched + on the table before corrective action can be taken to fix or remove + the invalid index. + + + + Also fix DROP INDEX CONCURRENTLY to not disable + insertions into the target index until all queries using it are done. + + + + Also fix misbehavior if DROP INDEX CONCURRENTLY is + canceled: the previous coding could leave an un-droppable index behind. + + + + + + Correct predicate locking for DROP INDEX CONCURRENTLY + (Kevin Grittner) + + + + Previously, SSI predicate locks were processed at the wrong time, + possibly leading to incorrect behavior of serializable transactions + executing in parallel with the DROP. + + + + + + Fix buffer locking during WAL replay (Tom Lane) + + + + The WAL replay code was insufficiently careful about locking buffers + when replaying WAL records that affect more than one page. This could + result in hot standby queries transiently seeing inconsistent states, + resulting in wrong answers or unexpected failures. + + + + + + Fix an error in WAL generation logic for GIN indexes (Tom Lane) + + + + This could result in index corruption, if a torn-page failure occurred. + + + + + + Fix an error in WAL replay logic for SP-GiST indexes (Tom Lane) + + + + This could result in index corruption after a crash, or on a standby + server. 
+ + + + + + Fix incorrect detection of end-of-base-backup location during WAL + recovery (Heikki Linnakangas) + + + + This mistake allowed hot standby mode to start up before the database + reaches a consistent state. + + + + + + Properly remove startup process's virtual XID lock when promoting a + hot standby server to normal running (Simon Riggs) + + + + This oversight could prevent subsequent execution of certain + operations such as CREATE INDEX CONCURRENTLY. + + + + + + Avoid bogus out-of-sequence timeline ID errors in standby + mode (Heikki Linnakangas) + + + + + + Prevent the postmaster from launching new child processes after it's + received a shutdown signal (Tom Lane) + + + + This mistake could result in shutdown taking longer than it should, or + even never completing at all without additional user action. + + + + + + Fix the syslogger process to not fail when + log_rotation_age exceeds 2^31 milliseconds (about 25 days) + (Tom Lane) + + + + + + Fix WaitLatch() to return promptly when the requested + timeout expires (Jeff Janes, Tom Lane) + + + + With the previous coding, a steady stream of non-wait-terminating + interrupts could delay return from WaitLatch() + indefinitely. This has been shown to be a problem for the autovacuum + launcher process, and might cause trouble elsewhere as well. + + + + + + Avoid corruption of internal hash tables when out of memory + (Hitoshi Harada) + + + + + + Prevent file descriptors for dropped tables from being held open past + transaction end (Tom Lane) + + + + This should reduce problems with long-since-dropped tables continuing + to occupy disk space. + + + + + + Prevent database-wide crash and restart when a new child process is + unable to create a pipe for its latch (Tom Lane) + + + + Although the new process must fail, there is no good reason to force a + database-wide restart, so avoid that. This improves robustness when + the kernel is nearly out of file descriptors. + + + + + + Avoid planner crash with joins to unflattened subqueries (Tom Lane) + + + + + + Fix planning of non-strict equivalence clauses above outer joins + (Tom Lane) + + + + The planner could derive incorrect constraints from a clause equating + a non-strict construct to something else, for example + WHERE COALESCE(foo, 0) = 0 + when foo is coming from the nullable side of an outer join. + 9.2 showed this type of error in more cases than previous releases, + but the basic bug has been there for a long time. + + + + + + Fix SELECT DISTINCT with index-optimized + MIN/MAX on an inheritance tree (Tom Lane) + + + + The planner would fail with failed to re-find MinMaxAggInfo + record given this combination of factors. + + + + + + Make sure the planner sees implicit and explicit casts as equivalent + for all purposes, except in the minority of cases where there's + actually a semantic difference (Tom Lane) + + + + + + Include join clauses when considering whether partial indexes can be + used for a query (Tom Lane) + + + + A strict join clause can be sufficient to establish an + x IS NOT NULL predicate, for example. + This fixes a planner regression in 9.2, since previous versions could + make comparable deductions. 
+ + + + + + Limit growth of planning time when there are many indexable join + clauses for the same index (Tom Lane) + + + + + + Improve planner's ability to prove exclusion constraints from + equivalence classes (Tom Lane) + + + + + + Fix partial-row matching in hashed subplans to handle cross-type cases + correctly (Tom Lane) + + + + This affects multicolumn NOT IN subplans, such as + WHERE (a, b) NOT IN (SELECT x, y FROM ...) + when for instance b and y are int4 + and int8 respectively. This mistake led to wrong answers + or crashes depending on the specific datatypes involved. + + + + + + Fix btree mark/restore functions to handle array keys (Tom Lane) + + + + This oversight could result in wrong answers from merge joins whose + inner side is an index scan using an + indexed_column = + ANY(array) condition. + + + + + + Revert patch for taking fewer snapshots (Tom Lane) + + + + The 9.2 change to reduce the number of snapshots taken during query + execution led to some anomalous behaviors not seen in previous + releases, because execution would proceed with a snapshot acquired + before locking the tables used by the query. Thus, for example, + a query would not be guaranteed to see updates committed by a + preceding transaction even if that transaction had exclusive lock. + We'll probably revisit this in future releases, but meanwhile put it + back the way it was before 9.2. + + + + + + Acquire buffer lock when re-fetching the old tuple for an + AFTER ROW UPDATE/DELETE trigger (Andres Freund) + + + + In very unusual circumstances, this oversight could result in passing + incorrect data to a trigger WHEN condition, or to the + precheck logic for a foreign-key enforcement trigger. That could + result in a crash, or in an incorrect decision about whether to + fire the trigger. + + + + + + Fix ALTER COLUMN TYPE to handle inherited check + constraints properly (Pavan Deolasee) + + + + This worked correctly in pre-8.4 releases, and now works correctly + in 8.4 and later. + + + + + + Fix ALTER EXTENSION SET SCHEMA's failure to move some + subsidiary objects into the new schema (Álvaro Herrera, Dimitri + Fontaine) + + + + + + Handle CREATE TABLE AS EXECUTE correctly in extended query + protocol (Tom Lane) + + + + + + Don't modify the input parse tree in DROP RULE IF NOT + EXISTS and DROP TRIGGER IF NOT EXISTS (Tom Lane) + + + + This mistake would cause errors if a cached statement of one of these + types was re-executed. + + + + + + Fix REASSIGN OWNED to handle grants on tablespaces + (Álvaro Herrera) + + + + + + Ignore incorrect pg_attribute entries for system + columns for views (Tom Lane) + + + + Views do not have any system columns. However, we forgot to + remove such entries when converting a table to a view. That's fixed + properly for 9.3 and later, but in previous branches we need to defend + against existing mis-converted views. 
+ + + + + + Fix rule printing to dump INSERT INTO table + DEFAULT VALUES correctly (Tom Lane) + + + + + + Guard against stack overflow when there are too many + UNION/INTERSECT/EXCEPT clauses + in a query (Tom Lane) + + + + + + Prevent platform-dependent failures when dividing the minimum possible + integer value by -1 (Xi Wang, Tom Lane) + + + + + + Fix possible access past end of string in date parsing + (Hitoshi Harada) + + + + + + Fix failure to advance XID epoch if XID wraparound happens during a + checkpoint and wal_level is hot_standby + (Tom Lane, Andres Freund) + + + + While this mistake had no particular impact on + PostgreSQL itself, it was bad for + applications that rely on txid_current() and related + functions: the TXID value would appear to go backwards. + + + + + + Fix pg_terminate_backend() and + pg_cancel_backend() to not throw error for a non-existent + target process (Josh Kupershmidt) + + + + This case already worked as intended when called by a superuser, + but not so much when called by ordinary users. + + + + + + Fix display of + pg_stat_replication.sync_state at a + page boundary (Kyotaro Horiguchi) + + + + + + Produce an understandable error message if the length of the path name + for a Unix-domain socket exceeds the platform-specific limit + (Tom Lane, Andrew Dunstan) + + + + Formerly, this would result in something quite unhelpful, such as + Non-recoverable failure in name resolution. + + + + + + Fix memory leaks when sending composite column values to the client + (Tom Lane) + + + + + + Save some cycles by not searching for subtransaction locks at commit + (Simon Riggs) + + + + In a transaction holding many exclusive locks, this useless activity + could be quite costly. + + + + + + Make pg_ctl more robust about reading the + postmaster.pid file (Heikki Linnakangas) + + + + This fixes race conditions and possible file descriptor leakage. + + + + + + Fix possible crash in psql if incorrectly-encoded data + is presented and the client_encoding setting is a + client-only encoding, such as SJIS (Jiang Guiqing) + + + + + + Make pg_dump dump SEQUENCE SET items in + the data not pre-data section of the archive (Tom Lane) + + + + This fixes an undesirable inconsistency between the meanings of + + + + + + Fix pg_dump's handling of DROP DATABASE + commands in + + + Beginning in 9.2.0, pg_dump --clean would issue a + DROP DATABASE command, which was either useless or + dangerous depending on the usage scenario. It no longer does that. + This change also fixes the combination of + + + + + Fix pg_dump for views with circular dependencies and + no relation options (Tom Lane) + + + + The previous fix to dump relation options when a view is + involved in a circular dependency didn't work right for the case + that the view has no options; it emitted ALTER VIEW foo + SET () which is invalid syntax. + + + + + + Fix bugs in the restore.sql script emitted by + pg_dump in tar output format (Tom Lane) + + + + The script would fail outright on tables whose names include + upper-case characters. Also, make the script capable of restoring + data in + + + + + Fix pg_restore to accept POSIX-conformant + tar files (Brian Weaver, Tom Lane) + + + + The original coding of pg_dump's tar + output mode produced files that are not fully conformant with the + POSIX standard. This has been corrected for version 9.3. This + patch updates previous branches so that they will accept both the + incorrect and the corrected formats, in hopes of avoiding + compatibility problems when 9.3 comes out. 
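As background for the POSIX-conformance items here and below, the sketch that follows shows the ustar header fields involved. This is generic ustar layout offered only for orientation, not code taken from pg_dump, pg_restore, or pg_basebackup: the magic field is "ustar" plus a NUL, the version field is "00" with no NUL, and the checksum is the byte-sum of the 512-byte header with the chksum field itself counted as eight ASCII spaces, stored as six octal digits followed by a NUL and a space.

    #include <stdio.h>
    #include <string.h>

    /* Background sketch of the relevant POSIX ustar header fields;
     * not PostgreSQL code. */
    static void
    set_ustar_magic_and_checksum(unsigned char header[512])
    {
        unsigned long   sum = 0;
        int             i;

        memcpy(header + 257, "ustar", 6);   /* magic: "ustar" plus NUL */
        memcpy(header + 263, "00", 2);      /* version: "00", no NUL */

        memset(header + 148, ' ', 8);       /* chksum field counts as spaces */
        for (i = 0; i < 512; i++)
            sum += header[i];

        snprintf((char *) header + 148, 8, "%06lo", sum);
        header[155] = ' ';                  /* octal digits, NUL, then a space */
    }
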
+ + + + + + Fix tar files emitted by pg_basebackup to + be POSIX conformant (Brian Weaver, Tom Lane) + + + + + + Fix pg_resetxlog to locate postmaster.pid + correctly when given a relative path to the data directory (Tom Lane) + + + + This mistake could lead to pg_resetxlog not noticing + that there is an active postmaster using the data directory. + + + + + + Fix libpq's lo_import() and + lo_export() functions to report file I/O errors properly + (Tom Lane) + + + + + + Fix ecpg's processing of nested structure pointer + variables (Muhammad Usama) + + + + + + Fix ecpg's ecpg_get_data function to + handle arrays properly (Michael Meskes) + + + + + + Prevent pg_upgrade from trying to process TOAST tables + for system catalogs (Bruce Momjian) + + + + This fixes an error seen when the information_schema has + been dropped and recreated. Other failures were also possible. + + + + + + Improve pg_upgrade performance by setting + synchronous_commit to off in the new cluster + (Bruce Momjian) + + + + + + Make contrib/pageinspect's btree page inspection + functions take buffer locks while examining pages (Tom Lane) + + + + + + Work around unportable behavior of malloc(0) and + realloc(NULL, 0) (Tom Lane) + + + + On platforms where these calls return NULL, some code + mistakenly thought that meant out-of-memory. + This is known to have broken pg_dump for databases + containing no user-defined aggregates. There might be other cases + as well. + + + + + + Ensure that make install for an extension creates the + extension installation directory (Cédric Villemain) + + + + Previously, this step was missed if MODULEDIR was set in + the extension's Makefile. + + + + + + Fix pgxs support for building loadable modules on AIX + (Tom Lane) + + + + Building modules outside the original source tree didn't work on AIX. + + + + + + Update time zone data files to tzdata release 2012j + for DST law changes in Cuba, Israel, Jordan, Libya, Palestine, Western + Samoa, and portions of Brazil. + + + + + + + + Release 9.2.1 @@ -809,13 +1533,6 @@ - - - Take fewer MVCC snapshots - (Robert Haas) - - - Make the number of CLOG buffers scale based on remove_name on the schema. + + When objects that are subsidiary of other objects (such as a table's indexes + or triggers) are created or dropped, setattr permission will be + checked on the main object, instead of the subsidiary object itself. + + When is executed, setattr and relabelfrom will be checked on the object being relabeled diff --git a/doc/src/sgml/stylesheet.xsl b/doc/src/sgml/stylesheet.xsl index 72dcd2c986..09d8981e56 100644 --- a/doc/src/sgml/stylesheet.xsl +++ b/doc/src/sgml/stylesheet.xsl @@ -9,7 +9,7 @@ - + diff --git a/src/Makefile.global.in b/src/Makefile.global.in index fbaaaf995b..e10eeada87 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -31,6 +31,9 @@ all: # started to update the file. .DELETE_ON_ERROR: +# Never delete any intermediate files automatically. +.SECONDARY: + # PostgreSQL version number VERSION = @PACKAGE_VERSION@ MAJORVERSION = @PG_MAJORVERSION@ @@ -683,17 +686,6 @@ clean distclean maintainer-clean: clean-deps clean-deps: @rm -rf $(DEPDIR) -# When in automatic dependency mode, never delete any intermediate -# files automatically. Otherwise, the following could happen: When -# starting from a clean source tree, the first build would delete the -# intermediate file, but also create the dependency file, which -# mentions the intermediate file, thus making it non-intermediate. 
-# The second build will then need to rebuild the now non-intermediate -# missing file. So the second build will do work even though nothing -# had changed. One place where this happens is the .c -> .o -> .so -# chain for some contrib modules. -.SECONDARY: - endif # autodepend diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0366d59624..c32088f81d 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,6 +8,6 @@ subdir = src/backend/access top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = common gist hash heap index nbtree transam gin spgist +SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index b9bfde2ee4..55df02ace6 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -290,7 +290,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) if (metadata->head == InvalidBlockNumber) { /* - * Main list is empty, so just copy sublist into main list + * Main list is empty, so just insert sublist as main list */ START_CRIT_SECTION(); @@ -313,6 +313,14 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); + rdata[0].next = rdata + 1; + + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; + rdata[1].data = NULL; + rdata[1].len = 0; + rdata[1].next = NULL; + Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); START_CRIT_SECTION(); diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 250619cb2c..0ff66c875b 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -77,6 +77,9 @@ ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) MetaBuffer; Page page; + /* Backup blocks are not used in create_index records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true); Assert(BufferIsValid(MetaBuffer)); page = (Page) BufferGetPage(MetaBuffer); @@ -109,6 +112,9 @@ ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; + /* Backup blocks are not used in create_ptree records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + buffer = XLogReadBuffer(data->node, data->blkno, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); @@ -159,9 +165,12 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) } } - /* nothing else to do if page was backed up */ - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } buffer = XLogReadBuffer(data->node, data->blkno, false); if (!BufferIsValid(buffer)) @@ -256,6 +265,9 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) if (data->isData) flags |= GIN_DATA; + /* Backup blocks are not used in split records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + lbuffer = XLogReadBuffer(data->node, data->lblkno, true); Assert(BufferIsValid(lbuffer)); lpage = (Page) BufferGetPage(lbuffer); @@ -369,9 +381,12 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; - /* nothing to do if page was backed up (and no info to do it with) */ - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're 
done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } buffer = XLogReadBuffer(data->node, data->blkno, false); if (!BufferIsValid(buffer)) @@ -420,33 +435,38 @@ static void ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) { ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record); - Buffer buffer; + Buffer dbuffer; + Buffer pbuffer; + Buffer lbuffer; Page page; - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + dbuffer = RestoreBackupBlock(lsn, record, 0, false, true); + else { - buffer = XLogReadBuffer(data->node, data->blkno, false); - if (BufferIsValid(buffer)) + dbuffer = XLogReadBuffer(data->node, data->blkno, false); + if (BufferIsValid(dbuffer)) { - page = BufferGetPage(buffer); + page = BufferGetPage(dbuffer); if (!XLByteLE(lsn, PageGetLSN(page))) { Assert(GinPageIsData(page)); GinPageGetOpaque(page)->flags = GIN_DELETED; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); + MarkBufferDirty(dbuffer); } - UnlockReleaseBuffer(buffer); } } - if (!(record->xl_info & XLR_BKP_BLOCK_2)) + if (record->xl_info & XLR_BKP_BLOCK(1)) + pbuffer = RestoreBackupBlock(lsn, record, 1, false, true); + else { - buffer = XLogReadBuffer(data->node, data->parentBlkno, false); - if (BufferIsValid(buffer)) + pbuffer = XLogReadBuffer(data->node, data->parentBlkno, false); + if (BufferIsValid(pbuffer)) { - page = BufferGetPage(buffer); + page = BufferGetPage(pbuffer); if (!XLByteLE(lsn, PageGetLSN(page))) { Assert(GinPageIsData(page)); @@ -454,29 +474,35 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) GinPageDeletePostingItem(page, data->parentOffset); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); + MarkBufferDirty(pbuffer); } - UnlockReleaseBuffer(buffer); } } - if (!(record->xl_info & XLR_BKP_BLOCK_3) && data->leftBlkno != InvalidBlockNumber) + if (record->xl_info & XLR_BKP_BLOCK(2)) + (void) RestoreBackupBlock(lsn, record, 2, false, false); + else if (data->leftBlkno != InvalidBlockNumber) { - buffer = XLogReadBuffer(data->node, data->leftBlkno, false); - if (BufferIsValid(buffer)) + lbuffer = XLogReadBuffer(data->node, data->leftBlkno, false); + if (BufferIsValid(lbuffer)) { - page = BufferGetPage(buffer); + page = BufferGetPage(lbuffer); if (!XLByteLE(lsn, PageGetLSN(page))) { Assert(GinPageIsData(page)); GinPageGetOpaque(page)->rightlink = data->rightLink; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); + MarkBufferDirty(lbuffer); } - UnlockReleaseBuffer(buffer); + UnlockReleaseBuffer(lbuffer); } } + + if (BufferIsValid(pbuffer)) + UnlockReleaseBuffer(pbuffer); + if (BufferIsValid(dbuffer)) + UnlockReleaseBuffer(dbuffer); } static void @@ -505,7 +531,9 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) /* * insert into tail page */ - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(data->node, data->metadata.tail, false); if (BufferIsValid(buffer)) @@ -553,20 +581,25 @@ ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) /* * New tail */ - buffer = XLogReadBuffer(data->node, data->prevTail, false); - if (BufferIsValid(buffer)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { - Page page = BufferGetPage(buffer); - - if (!XLByteLE(lsn, PageGetLSN(page))) + buffer = 
XLogReadBuffer(data->node, data->prevTail, false); + if (BufferIsValid(buffer)) { - GinPageGetOpaque(page)->rightlink = data->newRightlink; + Page page = BufferGetPage(buffer); - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + GinPageGetOpaque(page)->rightlink = data->newRightlink; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); } - UnlockReleaseBuffer(buffer); } } @@ -585,8 +618,12 @@ ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) tupsize; IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsertListPage)); - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } buffer = XLogReadBuffer(data->node, data->blkno, true); Assert(BufferIsValid(buffer)); @@ -632,6 +669,9 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) Page metapage; int i; + /* Backup blocks are not used in delete_listpage records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); if (!BufferIsValid(metabuffer)) return; /* assume index was deleted, nothing to do */ @@ -645,6 +685,16 @@ ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) MarkBufferDirty(metabuffer); } + /* + * In normal operation, shiftList() takes exclusive lock on all the + * pages-to-be-deleted simultaneously. During replay, however, it should + * be all right to lock them one at a time. This is dependent on the fact + * that we are deleting pages from the head of the list, and that readers + * share-lock the next page before releasing the one they are on. So we + * cannot get past a reader that is on, or due to visit, any page we are + * going to delete. New incoming readers will block behind our metapage + * lock and then see a fully updated page list. + */ for (i = 0; i < data->ndeleted; i++) { Buffer buffer = XLogReadBuffer(data->node, data->toDelete[i], false); @@ -678,7 +728,6 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record) * implement a similar optimization as we have in b-tree, and remove * killed tuples outside VACUUM, we'll need to handle that here. 
*/ - RestoreBkpBlocks(lsn, record, false); topCtx = MemoryContextSwitchTo(opCtx); switch (info) @@ -717,69 +766,6 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record) MemoryContextReset(opCtx); } -static void -desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno) -{ - appendStringInfo(buf, "node: %u/%u/%u blkno: %u", - node.spcNode, node.dbNode, node.relNode, blkno); -} - -void -gin_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - switch (info) - { - case XLOG_GIN_CREATE_INDEX: - appendStringInfo(buf, "Create index, "); - desc_node(buf, *(RelFileNode *) rec, GIN_ROOT_BLKNO); - break; - case XLOG_GIN_CREATE_PTREE: - appendStringInfo(buf, "Create posting tree, "); - desc_node(buf, ((ginxlogCreatePostingTree *) rec)->node, ((ginxlogCreatePostingTree *) rec)->blkno); - break; - case XLOG_GIN_INSERT: - appendStringInfo(buf, "Insert item, "); - desc_node(buf, ((ginxlogInsert *) rec)->node, ((ginxlogInsert *) rec)->blkno); - appendStringInfo(buf, " offset: %u nitem: %u isdata: %c isleaf %c isdelete %c updateBlkno:%u", - ((ginxlogInsert *) rec)->offset, - ((ginxlogInsert *) rec)->nitem, - (((ginxlogInsert *) rec)->isData) ? 'T' : 'F', - (((ginxlogInsert *) rec)->isLeaf) ? 'T' : 'F', - (((ginxlogInsert *) rec)->isDelete) ? 'T' : 'F', - ((ginxlogInsert *) rec)->updateBlkno); - break; - case XLOG_GIN_SPLIT: - appendStringInfo(buf, "Page split, "); - desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno); - appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->isRootSplit) ? 'T' : 'F'); - break; - case XLOG_GIN_VACUUM_PAGE: - appendStringInfo(buf, "Vacuum page, "); - desc_node(buf, ((ginxlogVacuumPage *) rec)->node, ((ginxlogVacuumPage *) rec)->blkno); - break; - case XLOG_GIN_DELETE_PAGE: - appendStringInfo(buf, "Delete page, "); - desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno); - break; - case XLOG_GIN_UPDATE_META_PAGE: - appendStringInfo(buf, "Update metapage, "); - desc_node(buf, ((ginxlogUpdateMeta *) rec)->node, GIN_METAPAGE_BLKNO); - break; - case XLOG_GIN_INSERT_LISTPAGE: - appendStringInfo(buf, "Insert new list page, "); - desc_node(buf, ((ginxlogInsertListPage *) rec)->node, ((ginxlogInsertListPage *) rec)->blkno); - break; - case XLOG_GIN_DELETE_LISTPAGE: - appendStringInfo(buf, "Delete list pages (%d), ", ((ginxlogDeleteListPages *) rec)->ndeleted); - desc_node(buf, ((ginxlogDeleteListPages *) rec)->node, GIN_METAPAGE_BLKNO); - break; - default: - elog(PANIC, "gin_desc: unknown op code %u", info); - } -} - void gin_xlog_startup(void) { diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 76029d9949..f9c8fcbcf5 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -32,35 +32,48 @@ typedef struct static MemoryContext opCtx; /* working memory for operations */ /* - * Replay the clearing of F_FOLLOW_RIGHT flag. + * Replay the clearing of F_FOLLOW_RIGHT flag on a child page. + * + * Even if the WAL record includes a full-page image, we have to update the + * follow-right flag, because that change is not included in the full-page + * image. To be sure that the intermediate state with the wrong flag value is + * not visible to concurrent Hot Standby queries, this function handles + * restoring the full-page image as well as updating the flag. (Note that + * we never need to do anything else to the child page in the current WAL + * action.) 
*/ static void -gistRedoClearFollowRight(RelFileNode node, XLogRecPtr lsn, - BlockNumber leftblkno) +gistRedoClearFollowRight(XLogRecPtr lsn, XLogRecord *record, int block_index, + RelFileNode node, BlockNumber childblkno) { Buffer buffer; + Page page; - buffer = XLogReadBuffer(node, leftblkno, false); - if (BufferIsValid(buffer)) + if (record->xl_info & XLR_BKP_BLOCK(block_index)) + buffer = RestoreBackupBlock(lsn, record, block_index, false, true); + else { - Page page = (Page) BufferGetPage(buffer); + buffer = XLogReadBuffer(node, childblkno, false); + if (!BufferIsValid(buffer)) + return; /* page was deleted, nothing to do */ + } + page = (Page) BufferGetPage(buffer); - /* - * Note that we still update the page even if page LSN is equal to the - * LSN of this record, because the updated NSN is not included in the - * full page image. - */ - if (!XLByteLT(lsn, PageGetLSN(page))) - { - GistPageGetOpaque(page)->nsn = lsn; - GistClearFollowRight(page); + /* + * Note that we still update the page even if page LSN is equal to the LSN + * of this record, because the updated NSN is not included in the full + * page image. + */ + if (!XLByteLT(lsn, PageGetLSN(page))) + { + GistPageGetOpaque(page)->nsn = lsn; + GistClearFollowRight(page); - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - } - UnlockReleaseBuffer(buffer); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); } + UnlockReleaseBuffer(buffer); } /* @@ -75,18 +88,37 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) Page page; char *data; + /* + * We need to acquire and hold lock on target page while updating the left + * child page. If we have a full-page image of target page, getting the + * lock is a side-effect of restoring that image. Note that even if the + * target page no longer exists, we'll still attempt to replay the change + * on the child page. 
+ */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + buffer = RestoreBackupBlock(lsn, record, 0, false, true); + else + buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); + + /* Fix follow-right data on left child page */ if (BlockNumberIsValid(xldata->leftchild)) - gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); + gistRedoClearFollowRight(lsn, record, 1, + xldata->node, xldata->leftchild); - /* nothing more to do if page was backed up (and no info to do it with) */ - if (record->xl_info & XLR_BKP_BLOCK_1) + /* Done if target page no longer exists */ + if (!BufferIsValid(buffer)) return; - buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); - if (!BufferIsValid(buffer)) + /* nothing more to do if page was backed up (and no info to do it with) */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + UnlockReleaseBuffer(buffer); return; + } + page = (Page) BufferGetPage(buffer); + /* nothing more to do if change already applied */ if (XLByteLE(lsn, PageGetLSN(page))) { UnlockReleaseBuffer(buffer); @@ -140,13 +172,16 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) GistClearTuplesDeleted(page); } - if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) - + if (!GistPageIsLeaf(page) && + PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && + xldata->blkno == GIST_ROOT_BLKNO) + { /* * all links on non-leaf root page was deleted by vacuum full, so root * page becomes a leaf */ GistPageSetLeaf(page); + } GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; PageSetLSN(page, lsn); @@ -155,30 +190,6 @@ gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); } -static void -gistRedoPageDeleteRecord(XLogRecPtr lsn, XLogRecord *record) -{ - gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record); - Buffer buffer; - Page page; - - /* nothing else to do if page was backed up (and no info to do it with) */ - if (record->xl_info & XLR_BKP_BLOCK_1) - return; - - buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); - if (!BufferIsValid(buffer)) - return; - - page = (Page) BufferGetPage(buffer); - GistPageSetDeleted(page); - - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); -} - static void decodePageSplitRecord(PageSplitRecord *decoded, XLogRecord *record) { @@ -215,15 +226,22 @@ gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) { gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); PageSplitRecord xlrec; + Buffer firstbuffer = InvalidBuffer; Buffer buffer; Page page; int i; bool isrootsplit = false; - if (BlockNumberIsValid(xldata->leftchild)) - gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); decodePageSplitRecord(&xlrec, record); + /* + * We must hold lock on the first-listed page throughout the action, + * including while updating the left child page (if any). We can unlock + * remaining pages in the list as soon as they've been written, because + * there is no path for concurrent queries to reach those pages without + * first visiting the first-listed page. 
+ */ + /* loop around all pages */ for (i = 0; i < xlrec.data->npage; i++) { @@ -273,8 +291,20 @@ gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + + if (i == 0) + firstbuffer = buffer; + else + UnlockReleaseBuffer(buffer); } + + /* Fix follow-right data on left child page, if any */ + if (BlockNumberIsValid(xldata->leftchild)) + gistRedoClearFollowRight(lsn, record, 0, + xldata->node, xldata->leftchild); + + /* Finally, release lock on the first page */ + UnlockReleaseBuffer(firstbuffer); } static void @@ -284,6 +314,9 @@ gistRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; + /* Backup blocks are not used in create_index records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + buffer = XLogReadBuffer(*node, GIST_ROOT_BLKNO, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); @@ -308,7 +341,6 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) * implement a similar optimization we have in b-tree, and remove killed * tuples outside VACUUM, we'll need to handle that here. */ - RestoreBkpBlocks(lsn, record, false); oldCxt = MemoryContextSwitchTo(opCtx); switch (info) @@ -316,9 +348,6 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) case XLOG_GIST_PAGE_UPDATE: gistRedoPageUpdateRecord(lsn, record); break; - case XLOG_GIST_PAGE_DELETE: - gistRedoPageDeleteRecord(lsn, record); - break; case XLOG_GIST_PAGE_SPLIT: gistRedoPageSplitRecord(lsn, record); break; @@ -333,66 +362,6 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record) MemoryContextReset(opCtx); } -static void -out_target(StringInfo buf, RelFileNode node) -{ - appendStringInfo(buf, "rel %u/%u/%u", - node.spcNode, node.dbNode, node.relNode); -} - -static void -out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) -{ - out_target(buf, xlrec->node); - appendStringInfo(buf, "; block number %u", xlrec->blkno); -} - -static void -out_gistxlogPageDelete(StringInfo buf, gistxlogPageDelete *xlrec) -{ - appendStringInfo(buf, "page_delete: rel %u/%u/%u; blkno %u", - xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, - xlrec->blkno); -} - -static void -out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec) -{ - appendStringInfo(buf, "page_split: "); - out_target(buf, xlrec->node); - appendStringInfo(buf, "; block number %u splits to %d pages", - xlrec->origblkno, xlrec->npage); -} - -void -gist_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - switch (info) - { - case XLOG_GIST_PAGE_UPDATE: - appendStringInfo(buf, "page_update: "); - out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec); - break; - case XLOG_GIST_PAGE_DELETE: - out_gistxlogPageDelete(buf, (gistxlogPageDelete *) rec); - break; - case XLOG_GIST_PAGE_SPLIT: - out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec); - break; - case XLOG_GIST_CREATE_INDEX: - appendStringInfo(buf, "create_index: rel %u/%u/%u", - ((RelFileNode *) rec)->spcNode, - ((RelFileNode *) rec)->dbNode, - ((RelFileNode *) rec)->relNode); - break; - default: - appendStringInfo(buf, "unknown gist op code %u", info); - break; - } -} - void gist_xlog_startup(void) { @@ -498,37 +467,30 @@ gistXLogUpdate(RelFileNode node, Buffer buffer, Buffer leftchildbuf) { XLogRecData *rdata; - gistxlogPageUpdate *xlrec; + gistxlogPageUpdate xlrec; int cur, i; XLogRecPtr recptr; - rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (4 + ituplen)); - xlrec = (gistxlogPageUpdate *) 
palloc(sizeof(gistxlogPageUpdate)); + rdata = (XLogRecData *) palloc(sizeof(XLogRecData) * (3 + ituplen)); - xlrec->node = node; - xlrec->blkno = BufferGetBlockNumber(buffer); - xlrec->ntodelete = ntodelete; - xlrec->leftchild = + xlrec.node = node; + xlrec.blkno = BufferGetBlockNumber(buffer); + xlrec.ntodelete = ntodelete; + xlrec.leftchild = BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber; - rdata[0].buffer = buffer; - rdata[0].buffer_std = true; - rdata[0].data = NULL; - rdata[0].len = 0; + rdata[0].data = (char *) &xlrec; + rdata[0].len = sizeof(gistxlogPageUpdate); + rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); - rdata[1].data = (char *) xlrec; - rdata[1].len = sizeof(gistxlogPageUpdate); - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &(rdata[2]); - - rdata[2].data = (char *) todelete; - rdata[2].len = sizeof(OffsetNumber) * ntodelete; - rdata[2].buffer = buffer; - rdata[2].buffer_std = true; + rdata[1].data = (char *) todelete; + rdata[1].len = sizeof(OffsetNumber) * ntodelete; + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; - cur = 3; + cur = 2; /* new tuples */ for (i = 0; i < ituplen; i++) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 88026695b6..1efc5ef17c 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -712,8 +712,3 @@ hash_redo(XLogRecPtr lsn, XLogRecord *record) { elog(PANIC, "hash_redo: unimplemented"); } - -void -hash_desc(StringInfo buf, uint8 xl_info, char *rec) -{ -} diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index f12cad44e5..4cf3c3a0d4 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,6 +386,34 @@ from the index, as well as ensuring that no one can see any inconsistent rows in a broken HOT chain (the first condition is stronger than the second). Finally, we can mark the index valid for searches. +Note that we do not need to set pg_index.indcheckxmin in this code path, +because we have outwaited any transactions that would need to avoid using +the index. (indcheckxmin is only needed because non-concurrent CREATE +INDEX doesn't want to wait; its stronger lock would create too much risk of +deadlock if it did.) + + +DROP INDEX CONCURRENTLY +----------------------- + +DROP INDEX CONCURRENTLY is sort of the reverse sequence of CREATE INDEX +CONCURRENTLY. We first mark the index as not indisvalid, and then wait for +any transactions that could be using it in queries to end. (During this +time, index updates must still be performed as normal, since such +transactions might expect freshly inserted tuples to be findable.) +Then, we clear indisready and indislive, and again wait for transactions +that could be updating the index to end. Finally we can drop the index +normally (though taking only ShareUpdateExclusiveLock on its parent table). + +The reason we need the pg_index.indislive flag is that after the second +wait step begins, we don't want transactions to be touching the index at +all; otherwise they might suffer errors if the DROP finally commits while +they are reading catalog entries for the index. If we had only indisvalid +and indisready, this state would be indistinguishable from the first stage +of CREATE INDEX CONCURRENTLY --- but in that state, we *do* want +transactions to examine the index, since they must consider it in +HOT-safety checks. 
+ Limitations and Restrictions ---------------------------- diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5a4591e045..74c41fac3e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1875,6 +1875,12 @@ FreeBulkInsertState(BulkInsertState bistate) * The HEAP_INSERT_SKIP_FSM option is passed directly to * RelationGetBufferForTuple, which see for more info. * + * HEAP_INSERT_FROZEN should only be specified for inserts into + * relfilenodes created during the current subtransaction and when + * there are no prior snapshots or pre-existing portals open. + * This causes rows to be frozen, which is an MVCC violation and + * requires explicit options chosen by user. + * * Note that these options will be applied when inserting into the heap's * TOAST table, too, if the tuple requires any out-of-line data. * @@ -2078,7 +2084,13 @@ heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); tup->t_data->t_infomask |= HEAP_XMAX_INVALID; - HeapTupleHeaderSetXmin(tup->t_data, xid); + if (options & HEAP_INSERT_FROZEN) + { + tup->t_data->t_infomask |= HEAP_XMIN_COMMITTED; + HeapTupleHeaderSetXmin(tup->t_data, FrozenTransactionId); + } + else + HeapTupleHeaderSetXmin(tup->t_data, xid); HeapTupleHeaderSetCmin(tup->t_data, cid); HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */ tup->t_tableOid = RelationGetRelid(relation); @@ -2352,27 +2364,26 @@ simple_heap_insert(Relation relation, HeapTuple tup) * * relation - table to be modified (caller must hold suitable lock) * tid - TID of tuple to be deleted - * ctid - output parameter, used only for failure case (see below) - * update_xmax - output parameter, used only for failure case (see below) * cid - delete command ID (used for visibility test, and stored into * cmax if successful) * crosscheck - if not InvalidSnapshot, also check tuple against this * wait - true if should wait for any conflicting update to commit/abort + * hufd - output parameter, filled in failure cases (see below) * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we did delete it. Failure return codes are * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated * (the last only possible if wait == false). * - * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. - * If t_ctid is the same as tid, the tuple was deleted; if different, the - * tuple was updated, and t_ctid is the location of the replacement tuple. - * (t_xmax is needed to verify that the replacement tuple matches.) + * In the failure cases, the routine fills *hufd with the tuple's t_ctid, + * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we + * cannot obtain cmax from a combocid generated by another transaction). + * See comments for struct HeapUpdateFailureData for additional info. 
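For reference, HeapUpdateFailureData itself does not appear in this hunk.
Judging only from how hufd is filled in below, a sketch of the struct would
look roughly like this (the authoritative definition is in the heapam headers,
not shown in this excerpt):

    typedef struct HeapUpdateFailureData
    {
        ItemPointerData ctid;       /* t_ctid of the outdated tuple: equal to
                                     * the target TID if it was deleted, else
                                     * the TID of the replacement tuple */
        TransactionId   xmax;       /* xmax of the outdated tuple */
        CommandId       cmax;       /* cmax; only valid when the result is
                                     * HeapTupleSelfUpdated */
    } HeapUpdateFailureData;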
*/ HTSU_Result heap_delete(Relation relation, ItemPointer tid, - ItemPointer ctid, TransactionId *update_xmax, - CommandId cid, Snapshot crosscheck, bool wait) + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -2533,8 +2544,12 @@ heap_delete(Relation relation, ItemPointer tid, result == HeapTupleUpdated || result == HeapTupleBeingUpdated); Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); - *ctid = tp.t_data->t_ctid; - *update_xmax = HeapTupleHeaderGetXmax(tp.t_data); + hufd->ctid = tp.t_data->t_ctid; + hufd->xmax = HeapTupleHeaderGetXmax(tp.t_data); + if (result == HeapTupleSelfUpdated) + hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data); + else + hufd->cmax = 0; /* for lack of an InvalidCommandId value */ UnlockReleaseBuffer(buffer); if (have_tuple_lock) UnlockTuple(relation, &(tp.t_self), ExclusiveLock); @@ -2666,13 +2681,12 @@ void simple_heap_delete(Relation relation, ItemPointer tid) { HTSU_Result result; - ItemPointerData update_ctid; - TransactionId update_xmax; + HeapUpdateFailureData hufd; result = heap_delete(relation, tid, - &update_ctid, &update_xmax, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ ); + true /* wait for commit */, + &hufd); switch (result) { case HeapTupleSelfUpdated: @@ -2703,12 +2717,11 @@ simple_heap_delete(Relation relation, ItemPointer tid) * relation - table to be modified (caller must hold suitable lock) * otid - TID of old tuple to be replaced * newtup - newly constructed tuple data to store - * ctid - output parameter, used only for failure case (see below) - * update_xmax - output parameter, used only for failure case (see below) * cid - update command ID (used for visibility test, and stored into * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this * wait - true if should wait for any conflicting update to commit/abort + * hufd - output parameter, filled in failure cases (see below) * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we *did* update it. Failure return codes are @@ -2721,15 +2734,15 @@ simple_heap_delete(Relation relation, ItemPointer tid) * update was done. However, any TOAST changes in the new tuple's * data are not reflected into *newtup. * - * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. - * If t_ctid is the same as otid, the tuple was deleted; if different, the - * tuple was updated, and t_ctid is the location of the replacement tuple. - * (t_xmax is needed to verify that the replacement tuple matches.) + * In the failure cases, the routine fills *hufd with the tuple's t_ctid, + * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we + * cannot obtain cmax from a combocid generated by another transaction). + * See comments for struct HeapUpdateFailureData for additional info. 
*/ HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - ItemPointer ctid, TransactionId *update_xmax, - CommandId cid, Snapshot crosscheck, bool wait) + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -2908,8 +2921,12 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, result == HeapTupleUpdated || result == HeapTupleBeingUpdated); Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); - *ctid = oldtup.t_data->t_ctid; - *update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data); + hufd->ctid = oldtup.t_data->t_ctid; + hufd->xmax = HeapTupleHeaderGetXmax(oldtup.t_data); + if (result == HeapTupleSelfUpdated) + hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); + else + hufd->cmax = 0; /* for lack of an InvalidCommandId value */ UnlockReleaseBuffer(buffer); if (have_tuple_lock) UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); @@ -3379,13 +3396,12 @@ void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) { HTSU_Result result; - ItemPointerData update_ctid; - TransactionId update_xmax; + HeapUpdateFailureData hufd; result = heap_update(relation, otid, tup, - &update_ctid, &update_xmax, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ ); + true /* wait for commit */, + &hufd); switch (result) { case HeapTupleSelfUpdated: @@ -3423,18 +3439,17 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) * Output parameters: * *tuple: all fields filled in * *buffer: set to buffer holding tuple (pinned but not locked at exit) - * *ctid: set to tuple's t_ctid, but only in failure cases - * *update_xmax: set to tuple's xmax, but only in failure cases + * *hufd: filled in failure cases (see below) * * Function result may be: * HeapTupleMayBeUpdated: lock was successfully acquired * HeapTupleSelfUpdated: lock failed because tuple updated by self * HeapTupleUpdated: lock failed because tuple updated by other xact * - * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. - * If t_ctid is the same as t_self, the tuple was deleted; if different, the - * tuple was updated, and t_ctid is the location of the replacement tuple. - * (t_xmax is needed to verify that the replacement tuple matches.) + * In the failure cases, the routine fills *hufd with the tuple's t_ctid, + * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we + * cannot obtain cmax from a combocid generated by another transaction). + * See comments for struct HeapUpdateFailureData for additional info. * * * NOTES: because the shared-memory lock table is of finite size, but users @@ -3470,9 +3485,9 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) * conflict for a tuple, we don't incur any extra overhead. 
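A minimal caller-side sketch of how the hufd output is meant to be read (not
code from this patch; the lock mode and command id are chosen purely for
illustration). It follows the same convention the old ctid/update_xmax
out-parameters used: if the returned ctid still equals the TID that was passed
in, the tuple was deleted; otherwise it points at the replacement version.

    HeapUpdateFailureData hufd;
    Buffer      buffer;
    HTSU_Result res;

    res = heap_lock_tuple(relation, tuple,
                          GetCurrentCommandId(true), LockTupleExclusive,
                          false /* nowait */,
                          &buffer, &hufd);
    if (res == HeapTupleUpdated)
    {
        if (ItemPointerEquals(&hufd.ctid, &tuple->t_self))
        {
            /* tuple was deleted by a concurrent transaction */
        }
        else
        {
            /*
             * Tuple was updated; hufd.ctid is the replacement tuple's TID,
             * and hufd.xmax identifies the updating transaction.
             */
        }
    }
    /* buffer comes back pinned but not locked; release it when done */
    ReleaseBuffer(buffer);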
*/ HTSU_Result -heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, - ItemPointer ctid, TransactionId *update_xmax, - CommandId cid, LockTupleMode mode, bool nowait) +heap_lock_tuple(Relation relation, HeapTuple tuple, + CommandId cid, LockTupleMode mode, bool nowait, + Buffer *buffer, HeapUpdateFailureData *hufd) { HTSU_Result result; ItemPointer tid = &(tuple->t_self); @@ -3657,8 +3672,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, { Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); - *ctid = tuple->t_data->t_ctid; - *update_xmax = HeapTupleHeaderGetXmax(tuple->t_data); + hufd->ctid = tuple->t_data->t_ctid; + hufd->xmax = HeapTupleHeaderGetXmax(tuple->t_data); + if (result == HeapTupleSelfUpdated) + hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); + else + hufd->cmax = 0; /* for lack of an InvalidCommandId value */ LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); if (have_tuple_lock) UnlockTuple(relation, tid, tuple_lock_type); @@ -4613,6 +4632,9 @@ heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record) * conflict processing to occur before we begin index vacuum actions. see * vacuumlazy.c and also comments in btvacuumpage() */ + + /* Backup blocks are not used in cleanup_info records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); } /* @@ -4645,10 +4667,15 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); - RestoreBkpBlocks(lsn, record, true); - - if (record->xl_info & XLR_BKP_BLOCK_1) + /* + * If we have a full-page image, restore it (using a cleanup lock) and + * we're done. + */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, true, false); return; + } buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL); if (!BufferIsValid(buffer)) @@ -4714,15 +4741,16 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record) if (InHotStandby) ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node); - RestoreBkpBlocks(lsn, record, false); - - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } - buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL); + buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); if (!BufferIsValid(buffer)) return; - LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (XLByteLE(lsn, PageGetLSN(page))) @@ -4771,18 +4799,6 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; - /* - * Read the heap page, if it still exists. If the heap file has been - * dropped or truncated later in recovery, this might fail. In that case, - * there's no point in doing anything further, since the visibility map - * will have to be cleared out at the same time. 
- */ - buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, - RBM_NORMAL); - if (!BufferIsValid(buffer)) - return; - page = (Page) BufferGetPage(buffer); - /* * If there are any Hot Standby transactions running that have an xmin * horizon old enough that this page isn't all-visible for them, they @@ -4795,37 +4811,50 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) if (InHotStandby) ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* - * We don't bump the LSN of the heap page when setting the visibility map - * bit, because that would generate an unworkable volume of full-page - * writes. This exposes us to torn page hazards, but since we're not - * inspecting the existing page contents in any way, we don't care. - * - * However, all operations that clear the visibility map bit *do* bump the - * LSN, and those operations will only be replayed if the XLOG LSN follows - * the page LSN. Thus, if the page LSN has advanced past our XLOG - * record's LSN, we mustn't mark the page all-visible, because the - * subsequent update won't be replayed to clear the flag. + * Read the heap page, if it still exists. If the heap file has been + * dropped or truncated later in recovery, we don't need to update the + * page, but we'd better still update the visibility map. */ - if (!XLByteLE(lsn, PageGetLSN(page))) + buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, + RBM_NORMAL); + if (BufferIsValid(buffer)) { - PageSetAllVisible(page); - MarkBufferDirty(buffer); - } + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - /* Done with heap page. */ - UnlockReleaseBuffer(buffer); + page = (Page) BufferGetPage(buffer); + + /* + * We don't bump the LSN of the heap page when setting the visibility + * map bit, because that would generate an unworkable volume of + * full-page writes. This exposes us to torn page hazards, but since + * we're not inspecting the existing page contents in any way, we + * don't care. + * + * However, all operations that clear the visibility map bit *do* bump + * the LSN, and those operations will only be replayed if the XLOG LSN + * follows the page LSN. Thus, if the page LSN has advanced past our + * XLOG record's LSN, we mustn't mark the page all-visible, because + * the subsequent update won't be replayed to clear the flag. + */ + if (!XLByteLE(lsn, PageGetLSN(page))) + { + PageSetAllVisible(page); + MarkBufferDirty(buffer); + } + + /* Done with heap page. */ + UnlockReleaseBuffer(buffer); + } /* - * Even we skipped the heap page update due to the LSN interlock, it's + * Even if we skipped the heap page update due to the LSN interlock, it's * still safe to update the visibility map. Any WAL record that clears * the visibility map bit does so before checking the page LSN, so any * bits that need to be cleared will still be cleared. */ - if (record->xl_info & XLR_BKP_BLOCK_1) - RestoreBkpBlocks(lsn, record, false); + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); else { Relation reln; @@ -4837,13 +4866,13 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record) /* * Don't set the bit if replay has already passed this point. 
* - * It might be safe to do this unconditionally; if replay has past + * It might be safe to do this unconditionally; if replay has passed * this point, we'll replay at least as far this time as we did * before, and if this bit needs to be cleared, the record responsible * for doing so should be again replayed, and clear it. For right * now, out of an abundance of conservatism, we use the same test here - * we did for the heap page; if this results in a dropped bit, no real - * harm is done; and the next VACUUM will fix it. + * we did for the heap page. If this results in a dropped bit, no + * real harm is done; and the next VACUUM will fix it. */ if (!XLByteLE(lsn, PageGetLSN(BufferGetPage(vmbuffer)))) visibilitymap_set(reln, xlrec->block, lsn, vmbuffer, @@ -4861,6 +4890,9 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; + /* Backup blocks are not used in newpage records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + /* * Note: the NEWPAGE log record is used for both heaps and indexes, so do * not do anything that assumes we are touching a heap. @@ -4916,8 +4948,12 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) FreeFakeRelcacheEntry(reln); } - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } buffer = XLogReadBuffer(xlrec->target.node, blkno, false); if (!BufferIsValid(buffer)) @@ -4997,8 +5033,12 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) FreeFakeRelcacheEntry(reln); } - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } if (record->xl_info & XLOG_HEAP_INIT_PAGE) { @@ -5100,8 +5140,6 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) * required. */ - RestoreBkpBlocks(lsn, record, false); - xlrec = (xl_heap_multi_insert *) recdata; recdata += SizeOfHeapMultiInsert; @@ -5130,8 +5168,12 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record) FreeFakeRelcacheEntry(reln); } - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } if (isinit) { @@ -5225,9 +5267,10 @@ static void heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) { xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); - Buffer buffer; bool samepage = (ItemPointerGetBlockNumber(&(xlrec->newtid)) == ItemPointerGetBlockNumber(&(xlrec->target.tid))); + Buffer obuffer, + nbuffer; Page page; OffsetNumber offnum; ItemId lp = NULL; @@ -5258,27 +5301,44 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) FreeFakeRelcacheEntry(reln); } - if (record->xl_info & XLR_BKP_BLOCK_1) + /* + * In normal operation, it is important to lock the two pages in + * page-number order, to avoid possible deadlocks against other update + * operations going the other way. However, during WAL replay there can + * be no other update happening, so we don't need to worry about that. But + * we *do* need to worry that we don't expose an inconsistent state to Hot + * Standby queries --- so the original page can't be unlocked before we've + * added the new tuple to the new page. 
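As an aside on the page-number-order rule mentioned above: in normal operation
the deadlock-avoidance idiom is simply to take the two buffer locks in a
canonical order. A generic sketch, not code from this patch, assuming the old
and new tuples live on two distinct pages:

    if (oldblk < newblk)
    {
        LockBuffer(obuffer, BUFFER_LOCK_EXCLUSIVE);
        LockBuffer(nbuffer, BUFFER_LOCK_EXCLUSIVE);
    }
    else
    {
        LockBuffer(nbuffer, BUFFER_LOCK_EXCLUSIVE);
        LockBuffer(obuffer, BUFFER_LOCK_EXCLUSIVE);
    }

During replay that ordering is unnecessary, but the old page still has to stay
locked until the new tuple is in place so Hot Standby queries never observe
the intermediate state.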
+ */ + + if (record->xl_info & XLR_BKP_BLOCK(0)) { + obuffer = RestoreBackupBlock(lsn, record, 0, false, true); if (samepage) - return; /* backup block covered both changes */ + { + /* backup block covered both changes, so we're done */ + UnlockReleaseBuffer(obuffer); + return; + } goto newt; } /* Deal with old tuple version */ - buffer = XLogReadBuffer(xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->target.tid)), - false); - if (!BufferIsValid(buffer)) + obuffer = XLogReadBuffer(xlrec->target.node, + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + false); + if (!BufferIsValid(obuffer)) goto newt; - page = (Page) BufferGetPage(buffer); + page = (Page) BufferGetPage(obuffer); if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */ { - UnlockReleaseBuffer(buffer); if (samepage) + { + UnlockReleaseBuffer(obuffer); return; + } goto newt; } @@ -5316,11 +5376,14 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update) * is already applied */ if (samepage) + { + nbuffer = obuffer; goto newsame; + } + PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(obuffer); /* Deal with new tuple */ @@ -5342,31 +5405,38 @@ newt:; FreeFakeRelcacheEntry(reln); } - if (record->xl_info & XLR_BKP_BLOCK_2) + if (record->xl_info & XLR_BKP_BLOCK(1)) + { + (void) RestoreBackupBlock(lsn, record, 1, false, false); + if (BufferIsValid(obuffer)) + UnlockReleaseBuffer(obuffer); return; + } if (record->xl_info & XLOG_HEAP_INIT_PAGE) { - buffer = XLogReadBuffer(xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->newtid)), - true); - Assert(BufferIsValid(buffer)); - page = (Page) BufferGetPage(buffer); + nbuffer = XLogReadBuffer(xlrec->target.node, + ItemPointerGetBlockNumber(&(xlrec->newtid)), + true); + Assert(BufferIsValid(nbuffer)); + page = (Page) BufferGetPage(nbuffer); - PageInit(page, BufferGetPageSize(buffer), 0); + PageInit(page, BufferGetPageSize(nbuffer), 0); } else { - buffer = XLogReadBuffer(xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->newtid)), - false); - if (!BufferIsValid(buffer)) + nbuffer = XLogReadBuffer(xlrec->target.node, + ItemPointerGetBlockNumber(&(xlrec->newtid)), + false); + if (!BufferIsValid(nbuffer)) return; - page = (Page) BufferGetPage(buffer); + page = (Page) BufferGetPage(nbuffer); if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */ { - UnlockReleaseBuffer(buffer); + UnlockReleaseBuffer(nbuffer); + if (BufferIsValid(obuffer)) + UnlockReleaseBuffer(obuffer); return; } } @@ -5411,11 +5481,14 @@ newsame:; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(nbuffer); + UnlockReleaseBuffer(nbuffer); + + if (BufferIsValid(obuffer) && obuffer != nbuffer) + UnlockReleaseBuffer(obuffer); /* - * If the page is running low on free space, update the FSM as well. + * If the new page is running low on free space, update the FSM as well. * Arbitrarily, our definition of "low" is less than 20%. We can't do much * better than that without knowing the fill-factor for the table. 
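(For concreteness: with the default 8192-byte BLCKSZ, the "less than 20%"
threshold used below, freespace < BLCKSZ / 5, works out to fewer than 1638
bytes of remaining free space on the new page.)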
* @@ -5431,7 +5504,8 @@ newsame:; */ if (!hot_update && freespace < BLCKSZ / 5) XLogRecordPageWithFreeSpace(xlrec->target.node, - ItemPointerGetBlockNumber(&(xlrec->newtid)), freespace); + ItemPointerGetBlockNumber(&(xlrec->newtid)), + freespace); } static void @@ -5444,8 +5518,12 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) ItemId lp = NULL; HeapTupleHeader htup; - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } buffer = XLogReadBuffer(xlrec->target.node, ItemPointerGetBlockNumber(&(xlrec->target.tid)), @@ -5503,8 +5581,12 @@ heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record) uint32 oldlen; uint32 newlen; - if (record->xl_info & XLR_BKP_BLOCK_1) + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); return; + } buffer = XLogReadBuffer(xlrec->target.node, ItemPointerGetBlockNumber(&(xlrec->target.tid)), @@ -5553,8 +5635,6 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record) * required. The ones in heap2 rmgr do. */ - RestoreBkpBlocks(lsn, record, false); - switch (info & XLOG_HEAP_OPMASK) { case XLOG_HEAP_INSERT: @@ -5588,11 +5668,6 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - /* - * Note that RestoreBkpBlocks() is called after conflict processing within - * each record type handling function. - */ - switch (info & XLOG_HEAP_OPMASK) { case XLOG_HEAP2_FREEZE: @@ -5615,154 +5690,6 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) } } -static void -out_target(StringInfo buf, xl_heaptid *target) -{ - appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", - target->node.spcNode, target->node.dbNode, target->node.relNode, - ItemPointerGetBlockNumber(&(target->tid)), - ItemPointerGetOffsetNumber(&(target->tid))); -} - -void -heap_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP_INSERT) - { - xl_heap_insert *xlrec = (xl_heap_insert *) rec; - - if (xl_info & XLOG_HEAP_INIT_PAGE) - appendStringInfo(buf, "insert(init): "); - else - appendStringInfo(buf, "insert: "); - out_target(buf, &(xlrec->target)); - } - else if (info == XLOG_HEAP_DELETE) - { - xl_heap_delete *xlrec = (xl_heap_delete *) rec; - - appendStringInfo(buf, "delete: "); - out_target(buf, &(xlrec->target)); - } - else if (info == XLOG_HEAP_UPDATE) - { - xl_heap_update *xlrec = (xl_heap_update *) rec; - - if (xl_info & XLOG_HEAP_INIT_PAGE) - appendStringInfo(buf, "update(init): "); - else - appendStringInfo(buf, "update: "); - out_target(buf, &(xlrec->target)); - appendStringInfo(buf, "; new %u/%u", - ItemPointerGetBlockNumber(&(xlrec->newtid)), - ItemPointerGetOffsetNumber(&(xlrec->newtid))); - } - else if (info == XLOG_HEAP_HOT_UPDATE) - { - xl_heap_update *xlrec = (xl_heap_update *) rec; - - if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? 
*/ - appendStringInfo(buf, "hot_update(init): "); - else - appendStringInfo(buf, "hot_update: "); - out_target(buf, &(xlrec->target)); - appendStringInfo(buf, "; new %u/%u", - ItemPointerGetBlockNumber(&(xlrec->newtid)), - ItemPointerGetOffsetNumber(&(xlrec->newtid))); - } - else if (info == XLOG_HEAP_NEWPAGE) - { - xl_heap_newpage *xlrec = (xl_heap_newpage *) rec; - - appendStringInfo(buf, "newpage: rel %u/%u/%u; fork %u, blk %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->forknum, - xlrec->blkno); - } - else if (info == XLOG_HEAP_LOCK) - { - xl_heap_lock *xlrec = (xl_heap_lock *) rec; - - if (xlrec->shared_lock) - appendStringInfo(buf, "shared_lock: "); - else - appendStringInfo(buf, "exclusive_lock: "); - if (xlrec->xid_is_mxact) - appendStringInfo(buf, "mxid "); - else - appendStringInfo(buf, "xid "); - appendStringInfo(buf, "%u ", xlrec->locking_xid); - out_target(buf, &(xlrec->target)); - } - else if (info == XLOG_HEAP_INPLACE) - { - xl_heap_inplace *xlrec = (xl_heap_inplace *) rec; - - appendStringInfo(buf, "inplace: "); - out_target(buf, &(xlrec->target)); - } - else - appendStringInfo(buf, "UNKNOWN"); -} - -void -heap2_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP2_FREEZE) - { - xl_heap_freeze *xlrec = (xl_heap_freeze *) rec; - - appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, - xlrec->cutoff_xid); - } - else if (info == XLOG_HEAP2_CLEAN) - { - xl_heap_clean *xlrec = (xl_heap_clean *) rec; - - appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, - xlrec->latestRemovedXid); - } - else if (info == XLOG_HEAP2_CLEANUP_INFO) - { - xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; - - appendStringInfo(buf, "cleanup info: remxid %u", - xlrec->latestRemovedXid); - } - else if (info == XLOG_HEAP2_VISIBLE) - { - xl_heap_visible *xlrec = (xl_heap_visible *) rec; - - appendStringInfo(buf, "visible: rel %u/%u/%u; blk %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); - } - else if (info == XLOG_HEAP2_MULTI_INSERT) - { - xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; - - if (xl_info & XLOG_HEAP_INIT_PAGE) - appendStringInfo(buf, "multi-insert (init): "); - else - appendStringInfo(buf, "multi-insert: "); - appendStringInfo(buf, "rel %u/%u/%u; blk %u; %d tuples", - xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, - xlrec->blkno, xlrec->ntuples); - } - else - appendStringInfo(buf, "UNKNOWN"); -} - /* * heap_sync - sync a heap, for use when no WAL has been written * diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 72ea1719e7..9f850ab05f 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -218,10 +218,9 @@ btree_xlog_insert(bool isleaf, bool ismeta, datalen -= sizeof(xl_btree_metadata); } - if ((record->xl_info & XLR_BKP_BLOCK_1) && !ismeta && isleaf) - return; /* nothing to do */ - - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xlrec->target.node, ItemPointerGetBlockNumber(&(xlrec->target.tid)), @@ -249,6 +248,13 @@ btree_xlog_insert(bool isleaf, bool ismeta, } } + /* + * Note: in normal operation, we'd 
update the metapage while still holding + * lock on the page we inserted into. But during replay it's not + * necessary to hold that lock, since no other index updates can be + * happening concurrently, and readers will cope fine with following an + * obsolete link from the metapage. + */ if (ismeta) _bt_restore_meta(xlrec->target.node, lsn, md.root, md.level, @@ -290,7 +296,7 @@ btree_xlog_split(bool onleft, bool isroot, forget_matching_split(xlrec->node, downlink, false); /* Extract left hikey and its size (still assuming 16-bit alignment) */ - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (!(record->xl_info & XLR_BKP_BLOCK(0))) { /* We assume 16-bit alignment is enough for IndexTupleSize */ left_hikey = (Item) datapos; @@ -310,7 +316,7 @@ btree_xlog_split(bool onleft, bool isroot, datalen -= sizeof(OffsetNumber); } - if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1)) + if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0))) { /* * We assume that 16-bit alignment is enough to apply IndexTupleSize @@ -323,7 +329,7 @@ btree_xlog_split(bool onleft, bool isroot, datalen -= newitemsz; } - /* Reconstruct right (new) sibling from scratch */ + /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true); Assert(BufferIsValid(rbuf)); rpage = (Page) BufferGetPage(rbuf); @@ -357,18 +363,21 @@ btree_xlog_split(bool onleft, bool isroot, /* don't release the buffer yet; we touch right page's first item below */ - /* - * Reconstruct left (original) sibling if needed. Note that this code - * ensures that the items remaining on the left page are in the correct - * item number order, but it does not reproduce the physical order they - * would have had. Is this worth changing? See also _bt_restore_page(). - */ - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + /* Now reconstruct left (original) sibling page */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { Buffer lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false); if (BufferIsValid(lbuf)) { + /* + * Note that this code ensures that the items remaining on the + * left page are in the correct item number order, but it does not + * reproduce the physical order they would have had. Is this + * worth changing? See also _bt_restore_page(). + */ Page lpage = (Page) BufferGetPage(lbuf); BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); @@ -432,8 +441,17 @@ btree_xlog_split(bool onleft, bool isroot, /* We no longer need the right buffer */ UnlockReleaseBuffer(rbuf); - /* Fix left-link of the page to the right of the new right sibling */ - if (xlrec->rnext != P_NONE && !(record->xl_info & XLR_BKP_BLOCK_2)) + /* + * Fix left-link of the page to the right of the new right sibling. + * + * Note: in normal operation, we do this while still holding lock on the + * two split pages. However, that's not necessary for correctness in WAL + * replay, because no other index update can be in progress, and readers + * will cope properly when following an obsolete left-link. 
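To picture which page "the page to the right of the new right sibling" is, an
illustrative sketch of the sibling chain (not part of the patch):

    before split:  leftsib  <->  rnext
    after split:   leftsib  <->  rightsib (new)  <->  rnext

rnext's left-link previously pointed at leftsib and must now be repointed at
rightsib; that is the fix-up performed here.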
+ */ + if (record->xl_info & XLR_BKP_BLOCK(1)) + (void) RestoreBackupBlock(lsn, record, 1, false, false); + else if (xlrec->rnext != P_NONE) { Buffer buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false); @@ -463,13 +481,11 @@ btree_xlog_split(bool onleft, bool isroot, static void btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) { - xl_btree_vacuum *xlrec; + xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque opaque; - xlrec = (xl_btree_vacuum *) XLogRecGetData(record); - /* * If queries might be active then we need to ensure every block is * unpinned between the lastBlockVacuumed and the current block, if there @@ -502,13 +518,14 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) } /* - * If the block was restored from a full page image, nothing more to do. - * The RestoreBkpBlocks() call already pinned and took cleanup lock on it. - * XXX: Perhaps we should call RestoreBkpBlocks() *after* the loop above, - * to make the disk access more sequential. + * If we have a full-page image, restore it (using a cleanup lock) and + * we're done. */ - if (record->xl_info & XLR_BKP_BLOCK_1) + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, true, false); return; + } /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf @@ -563,9 +580,8 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) * XXX optimise later with something like XLogPrefetchBuffer() */ static TransactionId -btree_xlog_delete_get_latestRemovedXid(XLogRecord *record) +btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec) { - xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); OffsetNumber *unused; Buffer ibuffer, hbuffer; @@ -702,15 +718,35 @@ btree_xlog_delete_get_latestRemovedXid(XLogRecord *record) static void btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) { - xl_btree_delete *xlrec; + xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque opaque; - if (record->xl_info & XLR_BKP_BLOCK_1) - return; + /* + * If we have any conflict processing to do, it must happen before we + * update the page. + * + * Btree delete records can conflict with standby queries. You might + * think that vacuum records would conflict as well, but we've handled + * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid + * cleaned by the vacuum of the heap and so we can resolve any conflicts + * just once when that arrives. After that we know that no conflicts + * exist from individual btree vacuum records on that index. + */ + if (InHotStandby) + { + TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(xlrec); - xlrec = (xl_btree_delete *) XLogRecGetData(record); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node); + } + + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); + return; + } /* * We don't need to take a cleanup lock to apply these changes. See @@ -766,8 +802,18 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) leftsib = xlrec->leftblk; rightsib = xlrec->rightblk; + /* + * In normal operation, we would lock all the pages this WAL record + * touches before changing any of them. 
In WAL replay, it should be okay + * to lock just one page at a time, since no concurrent index updates can + * be happening, and readers should not care whether they arrive at the + * target page or not (since it's surely empty). + */ + /* parent page */ - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xlrec->target.node, parent, false); if (BufferIsValid(buffer)) @@ -813,7 +859,9 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) } /* Fix left-link of right sibling */ - if (!(record->xl_info & XLR_BKP_BLOCK_2)) + if (record->xl_info & XLR_BKP_BLOCK(1)) + (void) RestoreBackupBlock(lsn, record, 1, false, false); + else { buffer = XLogReadBuffer(xlrec->target.node, rightsib, false); if (BufferIsValid(buffer)) @@ -837,7 +885,9 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) } /* Fix right-link of left sibling, if any */ - if (!(record->xl_info & XLR_BKP_BLOCK_3)) + if (record->xl_info & XLR_BKP_BLOCK(2)) + (void) RestoreBackupBlock(lsn, record, 2, false, false); + else { if (leftsib != P_NONE) { @@ -911,6 +961,9 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) BTPageOpaque pageop; BlockNumber downlink = 0; + /* Backup blocks are not used in newroot records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); @@ -952,67 +1005,36 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) forget_matching_split(xlrec->node, downlink, true); } - -void -btree_redo(XLogRecPtr lsn, XLogRecord *record) +static void +btree_xlog_reuse_page(XLogRecPtr lsn, XLogRecord *record) { - uint8 info = record->xl_info & ~XLR_INFO_MASK; + xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record); /* - * If we have any conflict processing to do, it must happen before we - * update the page. + * Btree reuse_page records exist to provide a conflict point when we + * reuse pages in the index via the FSM. That's all they do though. + * + * latestRemovedXid was the page's btpo.xact. The btpo.xact < + * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the + * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs(). + * Consequently, one XID value achieves the same exclusion effect on + * master and standby. */ if (InHotStandby) { - switch (info) - { - case XLOG_BTREE_DELETE: - - /* - * Btree delete records can conflict with standby queries. You - * might think that vacuum records would conflict as well, but - * we've handled that already. XLOG_HEAP2_CLEANUP_INFO records - * provide the highest xid cleaned by the vacuum of the heap - * and so we can resolve any conflicts just once when that - * arrives. After that any we know that no conflicts exist - * from individual btree vacuum records on that index. - */ - { - TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record); - xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); - - ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node); - } - break; - - case XLOG_BTREE_REUSE_PAGE: + ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, + xlrec->node); + } - /* - * Btree reuse page records exist to provide a conflict point - * when we reuse pages in the index via the FSM. That's all it - * does though. latestRemovedXid was the page's btpo.xact. 
The - * btpo.xact < RecentGlobalXmin test in _bt_page_recyclable() - * conceptually mirrors the pgxact->xmin > limitXmin test in - * GetConflictingVirtualXIDs(). Consequently, one XID value - * achieves the same exclusion effect on master and standby. - */ - { - xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record); + /* Backup blocks are not used in reuse_page records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); +} - ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); - } - return; - default: - break; - } - } - - /* - * Vacuum needs to pin and take cleanup lock on every leaf page, a regular - * exclusive lock is enough for all other purposes. - */ - RestoreBkpBlocks(lsn, record, (info == XLOG_BTREE_VACUUM)); +void +btree_redo(XLogRecPtr lsn, XLogRecord *record) +{ + uint8 info = record->xl_info & ~XLR_INFO_MASK; switch (info) { @@ -1052,158 +1074,13 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) btree_xlog_newroot(lsn, record); break; case XLOG_BTREE_REUSE_PAGE: - /* Handled above before restoring bkp block */ + btree_xlog_reuse_page(lsn, record); break; default: elog(PANIC, "btree_redo: unknown op code %u", info); } } -static void -out_target(StringInfo buf, xl_btreetid *target) -{ - appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", - target->node.spcNode, target->node.dbNode, target->node.relNode, - ItemPointerGetBlockNumber(&(target->tid)), - ItemPointerGetOffsetNumber(&(target->tid))); -} - -void -btree_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - switch (info) - { - case XLOG_BTREE_INSERT_LEAF: - { - xl_btree_insert *xlrec = (xl_btree_insert *) rec; - - appendStringInfo(buf, "insert: "); - out_target(buf, &(xlrec->target)); - break; - } - case XLOG_BTREE_INSERT_UPPER: - { - xl_btree_insert *xlrec = (xl_btree_insert *) rec; - - appendStringInfo(buf, "insert_upper: "); - out_target(buf, &(xlrec->target)); - break; - } - case XLOG_BTREE_INSERT_META: - { - xl_btree_insert *xlrec = (xl_btree_insert *) rec; - - appendStringInfo(buf, "insert_meta: "); - out_target(buf, &(xlrec->target)); - break; - } - case XLOG_BTREE_SPLIT_L: - { - xl_btree_split *xlrec = (xl_btree_split *) rec; - - appendStringInfo(buf, "split_l: rel %u/%u/%u ", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode); - appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", - xlrec->leftsib, xlrec->rightsib, xlrec->rnext, - xlrec->level, xlrec->firstright); - break; - } - case XLOG_BTREE_SPLIT_R: - { - xl_btree_split *xlrec = (xl_btree_split *) rec; - - appendStringInfo(buf, "split_r: rel %u/%u/%u ", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode); - appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", - xlrec->leftsib, xlrec->rightsib, xlrec->rnext, - xlrec->level, xlrec->firstright); - break; - } - case XLOG_BTREE_SPLIT_L_ROOT: - { - xl_btree_split *xlrec = (xl_btree_split *) rec; - - appendStringInfo(buf, "split_l_root: rel %u/%u/%u ", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode); - appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", - xlrec->leftsib, xlrec->rightsib, xlrec->rnext, - xlrec->level, xlrec->firstright); - break; - } - case XLOG_BTREE_SPLIT_R_ROOT: - { - xl_btree_split *xlrec = (xl_btree_split *) rec; - - appendStringInfo(buf, "split_r_root: rel %u/%u/%u ", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode); - appendStringInfo(buf, "left %u, right %u, next %u, level %u, 
firstright %d", - xlrec->leftsib, xlrec->rightsib, xlrec->rnext, - xlrec->level, xlrec->firstright); - break; - } - case XLOG_BTREE_VACUUM: - { - xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - - appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, - xlrec->lastBlockVacuumed); - break; - } - case XLOG_BTREE_DELETE: - { - xl_btree_delete *xlrec = (xl_btree_delete *) rec; - - appendStringInfo(buf, "delete: index %u/%u/%u; iblk %u, heap %u/%u/%u;", - xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, - xlrec->block, - xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode); - break; - } - case XLOG_BTREE_DELETE_PAGE: - case XLOG_BTREE_DELETE_PAGE_META: - case XLOG_BTREE_DELETE_PAGE_HALF: - { - xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec; - - appendStringInfo(buf, "delete_page: "); - out_target(buf, &(xlrec->target)); - appendStringInfo(buf, "; dead %u; left %u; right %u", - xlrec->deadblk, xlrec->leftblk, xlrec->rightblk); - break; - } - case XLOG_BTREE_NEWROOT: - { - xl_btree_newroot *xlrec = (xl_btree_newroot *) rec; - - appendStringInfo(buf, "newroot: rel %u/%u/%u; root %u lev %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, - xlrec->rootblk, xlrec->level); - break; - } - case XLOG_BTREE_REUSE_PAGE: - { - xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; - - appendStringInfo(buf, "reuse_page: rel %u/%u/%u; latestRemovedXid %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->latestRemovedXid); - break; - } - default: - appendStringInfo(buf, "UNKNOWN"); - break; - } -} - void btree_xlog_startup(void) { diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile new file mode 100644 index 0000000000..7d092d205d --- /dev/null +++ b/src/backend/access/rmgrdesc/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for the rmgr descriptor routines +# +# src/backend/access/rmgrdesc/Makefile +# + +subdir = src/backend/access/rmgrdesc +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = clogdesc.o dbasedesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \ + mxactdesc.o nbtdesc.o relmapdesc.o seqdesc.o smgrdesc.o spgdesc.o \ + standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/clogdesc.c b/src/backend/access/rmgrdesc/clogdesc.c new file mode 100644 index 0000000000..07ab40fb41 --- /dev/null +++ b/src/backend/access/rmgrdesc/clogdesc.c @@ -0,0 +1,41 @@ +/*------------------------------------------------------------------------- + * + * clogdesc.c + * rmgr descriptor routines for access/transam/clog.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/clogdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" + + +void +clog_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == CLOG_ZEROPAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + appendStringInfo(buf, "zeropage: %d", pageno); + } + else if (info == CLOG_TRUNCATE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + appendStringInfo(buf, "truncate before: %d", pageno); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/dbasedesc.c b/src/backend/access/rmgrdesc/dbasedesc.c new file mode 100644 index 0000000000..d7ee96cb85 --- /dev/null +++ b/src/backend/access/rmgrdesc/dbasedesc.c @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * dbasedesc.c + * rmgr descriptor routines for commands/dbcommands.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/dbasedesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/dbcommands.h" +#include "lib/stringinfo.h" + + +void +dbase_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_DBASE_CREATE) + { + xl_dbase_create_rec *xlrec = (xl_dbase_create_rec *) rec; + + appendStringInfo(buf, "create db: copy dir %u/%u to %u/%u", + xlrec->src_db_id, xlrec->src_tablespace_id, + xlrec->db_id, xlrec->tablespace_id); + } + else if (info == XLOG_DBASE_DROP) + { + xl_dbase_drop_rec *xlrec = (xl_dbase_drop_rec *) rec; + + appendStringInfo(buf, "drop db: dir %u/%u", + xlrec->db_id, xlrec->tablespace_id); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c new file mode 100644 index 0000000000..7ea36f28e1 --- /dev/null +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -0,0 +1,83 @@ +/*------------------------------------------------------------------------- + * + * gindesc.c + * rmgr descriptor routines for access/transam/gin/ginxlog.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/gindesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include 
"access/gin_private.h" +#include "lib/stringinfo.h" +#include "storage/relfilenode.h" + +static void +desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno) +{ + appendStringInfo(buf, "node: %u/%u/%u blkno: %u", + node.spcNode, node.dbNode, node.relNode, blkno); +} + +void +gin_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_GIN_CREATE_INDEX: + appendStringInfo(buf, "Create index, "); + desc_node(buf, *(RelFileNode *) rec, GIN_ROOT_BLKNO); + break; + case XLOG_GIN_CREATE_PTREE: + appendStringInfo(buf, "Create posting tree, "); + desc_node(buf, ((ginxlogCreatePostingTree *) rec)->node, ((ginxlogCreatePostingTree *) rec)->blkno); + break; + case XLOG_GIN_INSERT: + appendStringInfo(buf, "Insert item, "); + desc_node(buf, ((ginxlogInsert *) rec)->node, ((ginxlogInsert *) rec)->blkno); + appendStringInfo(buf, " offset: %u nitem: %u isdata: %c isleaf %c isdelete %c updateBlkno:%u", + ((ginxlogInsert *) rec)->offset, + ((ginxlogInsert *) rec)->nitem, + (((ginxlogInsert *) rec)->isData) ? 'T' : 'F', + (((ginxlogInsert *) rec)->isLeaf) ? 'T' : 'F', + (((ginxlogInsert *) rec)->isDelete) ? 'T' : 'F', + ((ginxlogInsert *) rec)->updateBlkno); + break; + case XLOG_GIN_SPLIT: + appendStringInfo(buf, "Page split, "); + desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno); + appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->isRootSplit) ? 'T' : 'F'); + break; + case XLOG_GIN_VACUUM_PAGE: + appendStringInfo(buf, "Vacuum page, "); + desc_node(buf, ((ginxlogVacuumPage *) rec)->node, ((ginxlogVacuumPage *) rec)->blkno); + break; + case XLOG_GIN_DELETE_PAGE: + appendStringInfo(buf, "Delete page, "); + desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno); + break; + case XLOG_GIN_UPDATE_META_PAGE: + appendStringInfo(buf, "Update metapage, "); + desc_node(buf, ((ginxlogUpdateMeta *) rec)->node, GIN_METAPAGE_BLKNO); + break; + case XLOG_GIN_INSERT_LISTPAGE: + appendStringInfo(buf, "Insert new list page, "); + desc_node(buf, ((ginxlogInsertListPage *) rec)->node, ((ginxlogInsertListPage *) rec)->blkno); + break; + case XLOG_GIN_DELETE_LISTPAGE: + appendStringInfo(buf, "Delete list pages (%d), ", ((ginxlogDeleteListPages *) rec)->ndeleted); + desc_node(buf, ((ginxlogDeleteListPages *) rec)->node, GIN_METAPAGE_BLKNO); + break; + default: + appendStringInfo(buf, "unknown gin op code %u", info); + break; + } +} diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c new file mode 100644 index 0000000000..1f47c6b8bb --- /dev/null +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -0,0 +1,68 @@ +/*------------------------------------------------------------------------- + * + * gistdesc.c + * rmgr descriptor routines for access/gist/gistxlog.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/gistdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gist_private.h" +#include "lib/stringinfo.h" +#include "storage/relfilenode.h" + +static void +out_target(StringInfo buf, RelFileNode node) +{ + appendStringInfo(buf, "rel %u/%u/%u", + node.spcNode, node.dbNode, node.relNode); +} + +static void +out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) +{ + out_target(buf, xlrec->node); + 
appendStringInfo(buf, "; block number %u", xlrec->blkno); +} + +static void +out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec) +{ + appendStringInfo(buf, "page_split: "); + out_target(buf, xlrec->node); + appendStringInfo(buf, "; block number %u splits to %d pages", + xlrec->origblkno, xlrec->npage); +} + +void +gist_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_GIST_PAGE_UPDATE: + appendStringInfo(buf, "page_update: "); + out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec); + break; + case XLOG_GIST_PAGE_SPLIT: + out_gistxlogPageSplit(buf, (gistxlogPageSplit *) rec); + break; + case XLOG_GIST_CREATE_INDEX: + appendStringInfo(buf, "create_index: rel %u/%u/%u", + ((RelFileNode *) rec)->spcNode, + ((RelFileNode *) rec)->dbNode, + ((RelFileNode *) rec)->relNode); + break; + default: + appendStringInfo(buf, "unknown gist op code %u", info); + break; + } +} diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c new file mode 100644 index 0000000000..faba90c6f3 --- /dev/null +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * + * hashdesc.c + * rmgr descriptor routines for access/hash/hash.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/hashdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash.h" + +void +hash_desc(StringInfo buf, uint8 xl_info, char *rec) +{ +} diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c new file mode 100644 index 0000000000..e65745a96f --- /dev/null +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -0,0 +1,165 @@ +/*------------------------------------------------------------------------- + * + * heapdesc.c + * rmgr descriptor routines for access/heap/heapam.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/heapdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam_xlog.h" + +static void +out_target(StringInfo buf, xl_heaptid *target) +{ + appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", + target->node.spcNode, target->node.dbNode, target->node.relNode, + ItemPointerGetBlockNumber(&(target->tid)), + ItemPointerGetOffsetNumber(&(target->tid))); +} + +void +heap_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + info &= XLOG_HEAP_OPMASK; + if (info == XLOG_HEAP_INSERT) + { + xl_heap_insert *xlrec = (xl_heap_insert *) rec; + + if (xl_info & XLOG_HEAP_INIT_PAGE) + appendStringInfo(buf, "insert(init): "); + else + appendStringInfo(buf, "insert: "); + out_target(buf, &(xlrec->target)); + } + else if (info == XLOG_HEAP_DELETE) + { + xl_heap_delete *xlrec = (xl_heap_delete *) rec; + + appendStringInfo(buf, "delete: "); + out_target(buf, &(xlrec->target)); + } + else if (info == XLOG_HEAP_UPDATE) + { + xl_heap_update *xlrec = (xl_heap_update *) rec; + + if (xl_info & XLOG_HEAP_INIT_PAGE) + appendStringInfo(buf, "update(init): "); + else + appendStringInfo(buf, "update: "); + 
out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "; new %u/%u", + ItemPointerGetBlockNumber(&(xlrec->newtid)), + ItemPointerGetOffsetNumber(&(xlrec->newtid))); + } + else if (info == XLOG_HEAP_HOT_UPDATE) + { + xl_heap_update *xlrec = (xl_heap_update *) rec; + + if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? */ + appendStringInfo(buf, "hot_update(init): "); + else + appendStringInfo(buf, "hot_update: "); + out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "; new %u/%u", + ItemPointerGetBlockNumber(&(xlrec->newtid)), + ItemPointerGetOffsetNumber(&(xlrec->newtid))); + } + else if (info == XLOG_HEAP_NEWPAGE) + { + xl_heap_newpage *xlrec = (xl_heap_newpage *) rec; + + appendStringInfo(buf, "newpage: rel %u/%u/%u; fork %u, blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->forknum, + xlrec->blkno); + } + else if (info == XLOG_HEAP_LOCK) + { + xl_heap_lock *xlrec = (xl_heap_lock *) rec; + + if (xlrec->shared_lock) + appendStringInfo(buf, "shared_lock: "); + else + appendStringInfo(buf, "exclusive_lock: "); + if (xlrec->xid_is_mxact) + appendStringInfo(buf, "mxid "); + else + appendStringInfo(buf, "xid "); + appendStringInfo(buf, "%u ", xlrec->locking_xid); + out_target(buf, &(xlrec->target)); + } + else if (info == XLOG_HEAP_INPLACE) + { + xl_heap_inplace *xlrec = (xl_heap_inplace *) rec; + + appendStringInfo(buf, "inplace: "); + out_target(buf, &(xlrec->target)); + } + else + appendStringInfo(buf, "UNKNOWN"); +} + +void +heap2_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + info &= XLOG_HEAP_OPMASK; + if (info == XLOG_HEAP2_FREEZE) + { + xl_heap_freeze *xlrec = (xl_heap_freeze *) rec; + + appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + xlrec->cutoff_xid); + } + else if (info == XLOG_HEAP2_CLEAN) + { + xl_heap_clean *xlrec = (xl_heap_clean *) rec; + + appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + xlrec->latestRemovedXid); + } + else if (info == XLOG_HEAP2_CLEANUP_INFO) + { + xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; + + appendStringInfo(buf, "cleanup info: remxid %u", + xlrec->latestRemovedXid); + } + else if (info == XLOG_HEAP2_VISIBLE) + { + xl_heap_visible *xlrec = (xl_heap_visible *) rec; + + appendStringInfo(buf, "visible: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } + else if (info == XLOG_HEAP2_MULTI_INSERT) + { + xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; + + if (xl_info & XLOG_HEAP_INIT_PAGE) + appendStringInfo(buf, "multi-insert (init): "); + else + appendStringInfo(buf, "multi-insert: "); + appendStringInfo(buf, "rel %u/%u/%u; blk %u; %d tuples", + xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, + xlrec->blkno, xlrec->ntuples); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c new file mode 100644 index 0000000000..33b89f9103 --- /dev/null +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -0,0 +1,51 @@ +/*------------------------------------------------------------------------- + * + * mxactdesc.c + * rmgr descriptor routines for access/transam/multixact.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the 
University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/mxactdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" + + +void +multixact_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + appendStringInfo(buf, "zero offsets page: %d", pageno); + } + else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) + { + int pageno; + + memcpy(&pageno, rec, sizeof(int)); + appendStringInfo(buf, "zero members page: %d", pageno); + } + else if (info == XLOG_MULTIXACT_CREATE_ID) + { + xl_multixact_create *xlrec = (xl_multixact_create *) rec; + int i; + + appendStringInfo(buf, "create multixact %u offset %u:", + xlrec->mid, xlrec->moff); + for (i = 0; i < xlrec->nxids; i++) + appendStringInfo(buf, " %u", xlrec->xids[i]); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c new file mode 100644 index 0000000000..04da5f8691 --- /dev/null +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -0,0 +1,162 @@ +/*------------------------------------------------------------------------- + * + * nbtdesc.c + * rmgr descriptor routines for access/nbtree/nbtxlog.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/nbtdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" + +static void +out_target(StringInfo buf, xl_btreetid *target) +{ + appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", + target->node.spcNode, target->node.dbNode, target->node.relNode, + ItemPointerGetBlockNumber(&(target->tid)), + ItemPointerGetOffsetNumber(&(target->tid))); +} + +void +btree_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_BTREE_INSERT_LEAF: + { + xl_btree_insert *xlrec = (xl_btree_insert *) rec; + + appendStringInfo(buf, "insert: "); + out_target(buf, &(xlrec->target)); + break; + } + case XLOG_BTREE_INSERT_UPPER: + { + xl_btree_insert *xlrec = (xl_btree_insert *) rec; + + appendStringInfo(buf, "insert_upper: "); + out_target(buf, &(xlrec->target)); + break; + } + case XLOG_BTREE_INSERT_META: + { + xl_btree_insert *xlrec = (xl_btree_insert *) rec; + + appendStringInfo(buf, "insert_meta: "); + out_target(buf, &(xlrec->target)); + break; + } + case XLOG_BTREE_SPLIT_L: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + appendStringInfo(buf, "split_l: rel %u/%u/%u ", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode); + appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", + xlrec->leftsib, xlrec->rightsib, xlrec->rnext, + xlrec->level, xlrec->firstright); + break; + } + case XLOG_BTREE_SPLIT_R: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + appendStringInfo(buf, "split_r: rel %u/%u/%u ", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode); + appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", + xlrec->leftsib, xlrec->rightsib, xlrec->rnext, + xlrec->level, xlrec->firstright); + break; + } + case XLOG_BTREE_SPLIT_L_ROOT: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + 
appendStringInfo(buf, "split_l_root: rel %u/%u/%u ", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode); + appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", + xlrec->leftsib, xlrec->rightsib, xlrec->rnext, + xlrec->level, xlrec->firstright); + break; + } + case XLOG_BTREE_SPLIT_R_ROOT: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + appendStringInfo(buf, "split_r_root: rel %u/%u/%u ", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode); + appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d", + xlrec->leftsib, xlrec->rightsib, xlrec->rnext, + xlrec->level, xlrec->firstright); + break; + } + case XLOG_BTREE_VACUUM: + { + xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; + + appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + xlrec->lastBlockVacuumed); + break; + } + case XLOG_BTREE_DELETE: + { + xl_btree_delete *xlrec = (xl_btree_delete *) rec; + + appendStringInfo(buf, "delete: index %u/%u/%u; iblk %u, heap %u/%u/%u;", + xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, + xlrec->block, + xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode); + break; + } + case XLOG_BTREE_DELETE_PAGE: + case XLOG_BTREE_DELETE_PAGE_META: + case XLOG_BTREE_DELETE_PAGE_HALF: + { + xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec; + + appendStringInfo(buf, "delete_page: "); + out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "; dead %u; left %u; right %u", + xlrec->deadblk, xlrec->leftblk, xlrec->rightblk); + break; + } + case XLOG_BTREE_NEWROOT: + { + xl_btree_newroot *xlrec = (xl_btree_newroot *) rec; + + appendStringInfo(buf, "newroot: rel %u/%u/%u; root %u lev %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, + xlrec->rootblk, xlrec->level); + break; + } + case XLOG_BTREE_REUSE_PAGE: + { + xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; + + appendStringInfo(buf, "reuse_page: rel %u/%u/%u; latestRemovedXid %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->latestRemovedXid); + break; + } + default: + appendStringInfo(buf, "UNKNOWN"); + break; + } +} diff --git a/src/backend/access/rmgrdesc/relmapdesc.c b/src/backend/access/rmgrdesc/relmapdesc.c new file mode 100644 index 0000000000..3eaf6be9b4 --- /dev/null +++ b/src/backend/access/rmgrdesc/relmapdesc.c @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * relmapdesc.c + * rmgr descriptor routines for utils/cache/relmapper.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/relmapdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "utils/relmapper.h" + +void +relmap_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_RELMAP_UPDATE) + { + xl_relmap_update *xlrec = (xl_relmap_update *) rec; + + appendStringInfo(buf, "update relmap: database %u tablespace %u size %u", + xlrec->dbid, xlrec->tsid, xlrec->nbytes); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c new file mode 100644 index 0000000000..f282385045 --- /dev/null +++ 
b/src/backend/access/rmgrdesc/seqdesc.c @@ -0,0 +1,36 @@ +/*------------------------------------------------------------------------- + * + * seqdesc.c + * rmgr descriptor routines for commands/sequence.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/seqdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/sequence.h" + + +void +seq_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + xl_seq_rec *xlrec = (xl_seq_rec *) rec; + + if (info == XLOG_SEQ_LOG) + appendStringInfo(buf, "log: "); + else + { + appendStringInfo(buf, "UNKNOWN"); + return; + } + + appendStringInfo(buf, "rel %u/%u/%u", + xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode); +} diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c new file mode 100644 index 0000000000..40b9708bad --- /dev/null +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * smgrdesc.c + * rmgr descriptor routines for catalog/storage.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/smgrdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/catalog.h" +#include "catalog/storage_xlog.h" + + +void +smgr_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_SMGR_CREATE) + { + xl_smgr_create *xlrec = (xl_smgr_create *) rec; + char *path = relpathperm(xlrec->rnode, xlrec->forkNum); + + appendStringInfo(buf, "file create: %s", path); + pfree(path); + } + else if (info == XLOG_SMGR_TRUNCATE) + { + xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec; + char *path = relpathperm(xlrec->rnode, MAIN_FORKNUM); + + appendStringInfo(buf, "file truncate: %s to %u blocks", path, + xlrec->blkno); + pfree(path); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/spgdesc.c b/src/backend/access/rmgrdesc/spgdesc.c new file mode 100644 index 0000000000..20de85ddf8 --- /dev/null +++ b/src/backend/access/rmgrdesc/spgdesc.c @@ -0,0 +1,89 @@ +/*------------------------------------------------------------------------- + * + * spgdesc.c + * rmgr descriptor routines for access/spgist/spgxlog.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/spgdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/spgist_private.h" + +static void +out_target(StringInfo buf, RelFileNode node) +{ + appendStringInfo(buf, "rel %u/%u/%u ", + node.spcNode, node.dbNode, node.relNode); +} + +void +spg_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_SPGIST_CREATE_INDEX: + appendStringInfo(buf, "create_index: rel %u/%u/%u", + ((RelFileNode *) rec)->spcNode, + ((RelFileNode *) rec)->dbNode, + ((RelFileNode *) rec)->relNode); + break; + 
case XLOG_SPGIST_ADD_LEAF: + out_target(buf, ((spgxlogAddLeaf *) rec)->node); + appendStringInfo(buf, "add leaf to page: %u", + ((spgxlogAddLeaf *) rec)->blknoLeaf); + break; + case XLOG_SPGIST_MOVE_LEAFS: + out_target(buf, ((spgxlogMoveLeafs *) rec)->node); + appendStringInfo(buf, "move %u leafs from page %u to page %u", + ((spgxlogMoveLeafs *) rec)->nMoves, + ((spgxlogMoveLeafs *) rec)->blknoSrc, + ((spgxlogMoveLeafs *) rec)->blknoDst); + break; + case XLOG_SPGIST_ADD_NODE: + out_target(buf, ((spgxlogAddNode *) rec)->node); + appendStringInfo(buf, "add node to %u:%u", + ((spgxlogAddNode *) rec)->blkno, + ((spgxlogAddNode *) rec)->offnum); + break; + case XLOG_SPGIST_SPLIT_TUPLE: + out_target(buf, ((spgxlogSplitTuple *) rec)->node); + appendStringInfo(buf, "split node %u:%u to %u:%u", + ((spgxlogSplitTuple *) rec)->blknoPrefix, + ((spgxlogSplitTuple *) rec)->offnumPrefix, + ((spgxlogSplitTuple *) rec)->blknoPostfix, + ((spgxlogSplitTuple *) rec)->offnumPostfix); + break; + case XLOG_SPGIST_PICKSPLIT: + out_target(buf, ((spgxlogPickSplit *) rec)->node); + appendStringInfo(buf, "split leaf page"); + break; + case XLOG_SPGIST_VACUUM_LEAF: + out_target(buf, ((spgxlogVacuumLeaf *) rec)->node); + appendStringInfo(buf, "vacuum leaf tuples on page %u", + ((spgxlogVacuumLeaf *) rec)->blkno); + break; + case XLOG_SPGIST_VACUUM_ROOT: + out_target(buf, ((spgxlogVacuumRoot *) rec)->node); + appendStringInfo(buf, "vacuum leaf tuples on root page %u", + ((spgxlogVacuumRoot *) rec)->blkno); + break; + case XLOG_SPGIST_VACUUM_REDIRECT: + out_target(buf, ((spgxlogVacuumRedirect *) rec)->node); + appendStringInfo(buf, "vacuum redirect tuples on page %u, newest XID %u", + ((spgxlogVacuumRedirect *) rec)->blkno, + ((spgxlogVacuumRedirect *) rec)->newestRedirectXid); + break; + default: + appendStringInfo(buf, "unknown spgist op code %u", info); + break; + } +} diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c new file mode 100644 index 0000000000..d5982d1daf --- /dev/null +++ b/src/backend/access/rmgrdesc/standbydesc.c @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * standbydesc.c + * rmgr descriptor routines for storage/ipc/standby.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/standbydesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/standby.h" + +static void +standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec) +{ + int i; + + appendStringInfo(buf, " nextXid %u latestCompletedXid %u oldestRunningXid %u", + xlrec->nextXid, + xlrec->latestCompletedXid, + xlrec->oldestRunningXid); + if (xlrec->xcnt > 0) + { + appendStringInfo(buf, "; %d xacts:", xlrec->xcnt); + for (i = 0; i < xlrec->xcnt; i++) + appendStringInfo(buf, " %u", xlrec->xids[i]); + } + + if (xlrec->subxid_overflow) + appendStringInfo(buf, "; subxid ovf"); +} + +void +standby_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_STANDBY_LOCK) + { + xl_standby_locks *xlrec = (xl_standby_locks *) rec; + int i; + + appendStringInfo(buf, "AccessExclusive locks:"); + + for (i = 0; i < xlrec->nlocks; i++) + appendStringInfo(buf, " xid %u db %u rel %u", + xlrec->locks[i].xid, xlrec->locks[i].dbOid, + xlrec->locks[i].relOid); + } + else 
if (info == XLOG_RUNNING_XACTS) + { + xl_running_xacts *xlrec = (xl_running_xacts *) rec; + + appendStringInfo(buf, " running xacts:"); + standby_desc_running_xacts(buf, xlrec); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/tblspcdesc.c b/src/backend/access/rmgrdesc/tblspcdesc.c new file mode 100644 index 0000000000..803e1b0148 --- /dev/null +++ b/src/backend/access/rmgrdesc/tblspcdesc.c @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * tblspcdesc.c + * rmgr descriptor routines for commands/tablespace.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/tblspcdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/tablespace.h" + + +void +tblspc_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_TBLSPC_CREATE) + { + xl_tblspc_create_rec *xlrec = (xl_tblspc_create_rec *) rec; + + appendStringInfo(buf, "create tablespace: %u \"%s\"", + xlrec->ts_id, xlrec->ts_path); + } + else if (info == XLOG_TBLSPC_DROP) + { + xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) rec; + + appendStringInfo(buf, "drop tablespace: %u", xlrec->ts_id); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c new file mode 100644 index 0000000000..60deddcf8e --- /dev/null +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -0,0 +1,194 @@ +/*------------------------------------------------------------------------- + * + * xactdesc.c + * rmgr descriptor routines for access/transam/xact.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/xactdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "catalog/catalog.h" +#include "storage/sinval.h" +#include "utils/timestamp.h" + + +static void +xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec) +{ + int i; + TransactionId *subxacts; + + subxacts = (TransactionId *) &xlrec->xnodes[xlrec->nrels]; + + appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); + + if (xlrec->nrels > 0) + { + appendStringInfo(buf, "; rels:"); + for (i = 0; i < xlrec->nrels; i++) + { + char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM); + + appendStringInfo(buf, " %s", path); + pfree(path); + } + } + if (xlrec->nsubxacts > 0) + { + appendStringInfo(buf, "; subxacts:"); + for (i = 0; i < xlrec->nsubxacts; i++) + appendStringInfo(buf, " %u", subxacts[i]); + } + if (xlrec->nmsgs > 0) + { + SharedInvalidationMessage *msgs; + + msgs = (SharedInvalidationMessage *) &subxacts[xlrec->nsubxacts]; + + if (XactCompletionRelcacheInitFileInval(xlrec->xinfo)) + appendStringInfo(buf, "; relcache init file inval dbid %u tsid %u", + xlrec->dbId, xlrec->tsId); + + appendStringInfo(buf, "; inval msgs:"); + for (i = 0; i < xlrec->nmsgs; i++) + { + SharedInvalidationMessage *msg = &msgs[i]; + + if (msg->id >= 0) + appendStringInfo(buf, " catcache %d", msg->id); + else if (msg->id == SHAREDINVALCATALOG_ID) + appendStringInfo(buf, " catalog %u", msg->cat.catId); + else if 
(msg->id == SHAREDINVALRELCACHE_ID) + appendStringInfo(buf, " relcache %u", msg->rc.relId); + /* remaining cases not expected, but print something anyway */ + else if (msg->id == SHAREDINVALSMGR_ID) + appendStringInfo(buf, " smgr"); + else if (msg->id == SHAREDINVALRELMAP_ID) + appendStringInfo(buf, " relmap"); + else + appendStringInfo(buf, " unknown id %d", msg->id); + } + } +} + +static void +xact_desc_commit_compact(StringInfo buf, xl_xact_commit_compact *xlrec) +{ + int i; + + appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); + + if (xlrec->nsubxacts > 0) + { + appendStringInfo(buf, "; subxacts:"); + for (i = 0; i < xlrec->nsubxacts; i++) + appendStringInfo(buf, " %u", xlrec->subxacts[i]); + } +} + +static void +xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec) +{ + int i; + + appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); + if (xlrec->nrels > 0) + { + appendStringInfo(buf, "; rels:"); + for (i = 0; i < xlrec->nrels; i++) + { + char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM); + + appendStringInfo(buf, " %s", path); + pfree(path); + } + } + if (xlrec->nsubxacts > 0) + { + TransactionId *xacts = (TransactionId *) + &xlrec->xnodes[xlrec->nrels]; + + appendStringInfo(buf, "; subxacts:"); + for (i = 0; i < xlrec->nsubxacts; i++) + appendStringInfo(buf, " %u", xacts[i]); + } +} + +static void +xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) +{ + int i; + + appendStringInfo(buf, "subxacts:"); + + for (i = 0; i < xlrec->nsubxacts; i++) + appendStringInfo(buf, " %u", xlrec->xsub[i]); +} + +void +xact_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_XACT_COMMIT_COMPACT) + { + xl_xact_commit_compact *xlrec = (xl_xact_commit_compact *) rec; + + appendStringInfo(buf, "commit: "); + xact_desc_commit_compact(buf, xlrec); + } + else if (info == XLOG_XACT_COMMIT) + { + xl_xact_commit *xlrec = (xl_xact_commit *) rec; + + appendStringInfo(buf, "commit: "); + xact_desc_commit(buf, xlrec); + } + else if (info == XLOG_XACT_ABORT) + { + xl_xact_abort *xlrec = (xl_xact_abort *) rec; + + appendStringInfo(buf, "abort: "); + xact_desc_abort(buf, xlrec); + } + else if (info == XLOG_XACT_PREPARE) + { + appendStringInfo(buf, "prepare"); + } + else if (info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) rec; + + appendStringInfo(buf, "commit prepared %u: ", xlrec->xid); + xact_desc_commit(buf, &xlrec->crec); + } + else if (info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) rec; + + appendStringInfo(buf, "abort prepared %u: ", xlrec->xid); + xact_desc_abort(buf, &xlrec->arec); + } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; + + /* + * Note that we ignore the WAL record's xid, since we're more + * interested in the top-level xid that issued the record and which + * xids are being reported here. 
+ */ + appendStringInfo(buf, "xid assignment xtop %u: ", xlrec->xtop); + xact_desc_assignment(buf, xlrec); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c new file mode 100644 index 0000000000..862e3fa754 --- /dev/null +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -0,0 +1,120 @@ +/*------------------------------------------------------------------------- + * + * xlogdesc.c + * rmgr descriptor routines for access/transam/xlog.c + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/xlogdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog_internal.h" +#include "catalog/pg_control.h" +#include "utils/guc.h" + +/* + * GUC support + */ +const struct config_enum_entry wal_level_options[] = { + {"minimal", WAL_LEVEL_MINIMAL, false}, + {"archive", WAL_LEVEL_ARCHIVE, false}, + {"hot_standby", WAL_LEVEL_HOT_STANDBY, false}, + {NULL, 0, false} +}; + +void +xlog_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_CHECKPOINT_SHUTDOWN || + info == XLOG_CHECKPOINT_ONLINE) + { + CheckPoint *checkpoint = (CheckPoint *) rec; + + appendStringInfo(buf, "checkpoint: redo %X/%X; " + "tli %u; fpw %s; xid %u/%u; oid %u; multi %u; offset %u; " + "oldest xid %u in DB %u; oldest running xid %u; %s", + (uint32) (checkpoint->redo >> 32), (uint32) checkpoint->redo, + checkpoint->ThisTimeLineID, + checkpoint->fullPageWrites ? "true" : "false", + checkpoint->nextXidEpoch, checkpoint->nextXid, + checkpoint->nextOid, + checkpoint->nextMulti, + checkpoint->nextMultiOffset, + checkpoint->oldestXid, + checkpoint->oldestXidDB, + checkpoint->oldestActiveXid, + (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online"); + } + else if (info == XLOG_NOOP) + { + appendStringInfo(buf, "xlog no-op"); + } + else if (info == XLOG_NEXTOID) + { + Oid nextOid; + + memcpy(&nextOid, rec, sizeof(Oid)); + appendStringInfo(buf, "nextOid: %u", nextOid); + } + else if (info == XLOG_SWITCH) + { + appendStringInfo(buf, "xlog switch"); + } + else if (info == XLOG_RESTORE_POINT) + { + xl_restore_point *xlrec = (xl_restore_point *) rec; + + appendStringInfo(buf, "restore point: %s", xlrec->rp_name); + + } + else if (info == XLOG_BACKUP_END) + { + XLogRecPtr startpoint; + + memcpy(&startpoint, rec, sizeof(XLogRecPtr)); + appendStringInfo(buf, "backup end: %X/%X", + (uint32) (startpoint >> 32), (uint32) startpoint); + } + else if (info == XLOG_PARAMETER_CHANGE) + { + xl_parameter_change xlrec; + const char *wal_level_str; + const struct config_enum_entry *entry; + + memcpy(&xlrec, rec, sizeof(xl_parameter_change)); + + /* Find a string representation for wal_level */ + wal_level_str = "?"; + for (entry = wal_level_options; entry->name; entry++) + { + if (entry->val == xlrec.wal_level) + { + wal_level_str = entry->name; + break; + } + } + + appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s", + xlrec.MaxConnections, + xlrec.max_prepared_xacts, + xlrec.max_locks_per_xact, + wal_level_str); + } + else if (info == XLOG_FPW_CHANGE) + { + bool fpw; + + memcpy(&fpw, rec, sizeof(bool)); + appendStringInfo(buf, "full_page_writes: %s", fpw ? 
"true" : "false"); + } + else + appendStringInfo(buf, "UNKNOWN"); +} diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 26bbd656c1..7ee225ec23 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -18,7 +18,7 @@ #include "access/genam.h" #include "access/spgist_private.h" #include "access/transam.h" -#include "catalog/storage.h" +#include "catalog/storage_xlog.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "storage/bufmgr.h" diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 54e78f18b5..2a874a2f16 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -76,6 +76,9 @@ spgRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; + /* Backup blocks are not used in create_index records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + buffer = XLogReadBuffer(*node, SPGIST_METAPAGE_BLKNO, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); @@ -117,7 +120,14 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) ptr += sizeof(spgxlogAddLeaf); leafTuple = (SpGistLeafTuple) ptr; - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + /* + * In normal operation we would have both current and parent pages locked + * simultaneously; but in WAL replay it should be safe to update the leaf + * page before updating the parent. + */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blknoLeaf, xldata->newPage); @@ -169,8 +179,9 @@ spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) } /* update parent downlink if necessary */ - if (xldata->blknoParent != InvalidBlockNumber && - !(record->xl_info & XLR_BKP_BLOCK_2)) + if (record->xl_info & XLR_BKP_BLOCK(1)) + (void) RestoreBackupBlock(lsn, record, 1, false, false); + else if (xldata->blknoParent != InvalidBlockNumber) { buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); if (BufferIsValid(buffer)) @@ -219,8 +230,16 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) /* now ptr points to the list of leaf tuples */ + /* + * In normal operation we would have all three pages (source, dest, and + * parent) locked simultaneously; but in WAL replay it should be safe to + * update them one at a time, as long as we do it in the right order. 
+ */ + /* Insert tuples on the dest page (do first, so redirect is valid) */ - if (!(record->xl_info & XLR_BKP_BLOCK_2)) + if (record->xl_info & XLR_BKP_BLOCK(1)) + (void) RestoreBackupBlock(lsn, record, 1, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blknoDst, xldata->newPage); @@ -253,7 +272,9 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) } /* Delete tuples from the source page, inserting a redirection pointer */ - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false); if (BufferIsValid(buffer)) @@ -276,7 +297,9 @@ spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) } /* And update the parent downlink */ - if (!(record->xl_info & XLR_BKP_BLOCK_3)) + if (record->xl_info & XLR_BKP_BLOCK(2)) + (void) RestoreBackupBlock(lsn, record, 2, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); if (BufferIsValid(buffer)) @@ -322,7 +345,9 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) { /* update in place */ Assert(xldata->blknoParent == InvalidBlockNumber); - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (BufferIsValid(buffer)) @@ -347,8 +372,22 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) } else { + /* + * In normal operation we would have all three pages (source, dest, + * and parent) locked simultaneously; but in WAL replay it should be + * safe to update them one at a time, as long as we do it in the right + * order. + * + * The logic here depends on the assumption that blkno != blknoNew, + * else we can't tell which BKP bit goes with which page, and the LSN + * checks could go wrong too. + */ + Assert(xldata->blkno != xldata->blknoNew); + /* Install new tuple first so redirect is valid */ - if (!(record->xl_info & XLR_BKP_BLOCK_2)) + if (record->xl_info & XLR_BKP_BLOCK(1)) + (void) RestoreBackupBlock(lsn, record, 1, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blknoNew, xldata->newPage); @@ -365,8 +404,17 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) addOrReplaceTuple(page, (Item) innerTuple, innerTuple->size, xldata->offnumNew); - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); + /* + * If parent is in this same page, don't advance LSN; + * doing so would fool us into not applying the parent + * downlink update below. We'll update the LSN when we + * fix the parent downlink. 
+ */ + if (xldata->blknoParent != xldata->blknoNew) + { + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + } MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); @@ -374,7 +422,9 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) } /* Delete old tuple, replacing it with redirect or placeholder tuple */ - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (BufferIsValid(buffer)) @@ -405,8 +455,17 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) else SpGistPageGetOpaque(page)->nRedirection++; - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); + /* + * If parent is in this same page, don't advance LSN; + * doing so would fool us into not applying the parent + * downlink update below. We'll update the LSN when we + * fix the parent downlink. + */ + if (xldata->blknoParent != xldata->blkno) + { + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + } MarkBufferDirty(buffer); } UnlockReleaseBuffer(buffer); @@ -425,7 +484,12 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) else bbi = 2; - if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + if (record->xl_info & XLR_BKP_BLOCK(bbi)) + { + if (bbi == 2) /* else we already did it */ + (void) RestoreBackupBlock(lsn, record, bbi, false, false); + } + else { buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); if (BufferIsValid(buffer)) @@ -467,9 +531,16 @@ spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) ptr += prefixTuple->size; postfixTuple = (SpGistInnerTuple) ptr; + /* + * In normal operation we would have both pages locked simultaneously; but + * in WAL replay it should be safe to update them one at a time, as long + * as we do it in the right order. + */ + /* insert postfix tuple first to avoid dangling link */ - if (xldata->blknoPostfix != xldata->blknoPrefix && - !(record->xl_info & XLR_BKP_BLOCK_2)) + if (record->xl_info & XLR_BKP_BLOCK(1)) + (void) RestoreBackupBlock(lsn, record, 1, false, false); + else if (xldata->blknoPostfix != xldata->blknoPrefix) { buffer = XLogReadBuffer(xldata->node, xldata->blknoPostfix, xldata->newPage); @@ -495,7 +566,9 @@ spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) } /* now handle the original page */ - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blknoPrefix, false); if (BufferIsValid(buffer)) @@ -535,6 +608,8 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) uint8 *leafPageSelect; Buffer srcBuffer; Buffer destBuffer; + Page srcPage; + Page destPage; Page page; int bbi; int i; @@ -563,13 +638,14 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) { /* when splitting root, we touch it only in the guise of new inner */ srcBuffer = InvalidBuffer; + srcPage = NULL; } else if (xldata->initSrc) { /* just re-init the source page */ srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, true); Assert(BufferIsValid(srcBuffer)); - page = (Page) BufferGetPage(srcBuffer); + srcPage = (Page) BufferGetPage(srcBuffer); SpGistInitBuffer(srcBuffer, SPGIST_LEAF | (xldata->storesNulls ? 
SPGIST_NULLS : 0)); @@ -577,14 +653,24 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) } else { - /* delete the specified tuples from source page */ - if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + /* + * Delete the specified tuples from source page. (In case we're in + * Hot Standby, we need to hold lock on the page till we're done + * inserting leaf tuples and the new inner tuple, else the added + * redirect tuple will be a dangling link.) + */ + if (record->xl_info & XLR_BKP_BLOCK(bbi)) + { + srcBuffer = RestoreBackupBlock(lsn, record, bbi, false, true); + srcPage = NULL; /* don't need to do any page updates */ + } + else { srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false); if (BufferIsValid(srcBuffer)) { - page = BufferGetPage(srcBuffer); - if (!XLByteLE(lsn, PageGetLSN(page))) + srcPage = BufferGetPage(srcBuffer); + if (!XLByteLE(lsn, PageGetLSN(srcPage))) { /* * We have it a bit easier here than in doPickSplit(), @@ -592,14 +678,14 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) * we can inject the correct redirection tuple now. */ if (!state.isBuild) - spgPageIndexMultiDelete(&state, page, + spgPageIndexMultiDelete(&state, srcPage, toDelete, xldata->nDelete, SPGIST_REDIRECT, SPGIST_PLACEHOLDER, xldata->blknoInner, xldata->offnumInner); else - spgPageIndexMultiDelete(&state, page, + spgPageIndexMultiDelete(&state, srcPage, toDelete, xldata->nDelete, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, @@ -608,10 +694,12 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) /* don't update LSN etc till we're done with it */ } + else + srcPage = NULL; /* don't do any page updates */ } + else + srcPage = NULL; } - else - srcBuffer = InvalidBuffer; bbi++; } @@ -619,13 +707,14 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) if (xldata->blknoDest == InvalidBlockNumber) { destBuffer = InvalidBuffer; + destPage = NULL; } else if (xldata->initDest) { /* just re-init the dest page */ destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, true); Assert(BufferIsValid(destBuffer)); - page = (Page) BufferGetPage(destBuffer); + destPage = (Page) BufferGetPage(destBuffer); SpGistInitBuffer(destBuffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); @@ -633,10 +722,27 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) } else { - if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) - destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, false); + /* + * We could probably release the page lock immediately in the + * full-page-image case, but for safety let's hold it till later. + */ + if (record->xl_info & XLR_BKP_BLOCK(bbi)) + { + destBuffer = RestoreBackupBlock(lsn, record, bbi, false, true); + destPage = NULL; /* don't need to do any page updates */ + } else - destBuffer = InvalidBuffer; + { + destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, false); + if (BufferIsValid(destBuffer)) + { + destPage = (Page) BufferGetPage(destBuffer); + if (XLByteLE(lsn, PageGetLSN(destPage))) + destPage = NULL; /* don't do any page updates */ + } + else + destPage = NULL; + } bbi++; } @@ -644,47 +750,34 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < xldata->nInsert; i++) { SpGistLeafTuple lt = (SpGistLeafTuple) ptr; - Buffer leafBuffer; ptr += lt->size; - leafBuffer = leafPageSelect[i] ? destBuffer : srcBuffer; - if (!BufferIsValid(leafBuffer)) + page = leafPageSelect[i] ? 
destPage : srcPage; + if (page == NULL) continue; /* no need to touch this page */ - page = BufferGetPage(leafBuffer); - if (!XLByteLE(lsn, PageGetLSN(page))) - { - addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]); - } + addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]); } - /* Now update src and dest page LSNs */ - if (BufferIsValid(srcBuffer)) + /* Now update src and dest page LSNs if needed */ + if (srcPage != NULL) { - page = BufferGetPage(srcBuffer); - if (!XLByteLE(lsn, PageGetLSN(page))) - { - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(srcBuffer); - } - UnlockReleaseBuffer(srcBuffer); + PageSetLSN(srcPage, lsn); + PageSetTLI(srcPage, ThisTimeLineID); + MarkBufferDirty(srcBuffer); } - if (BufferIsValid(destBuffer)) + if (destPage != NULL) { - page = BufferGetPage(destBuffer); - if (!XLByteLE(lsn, PageGetLSN(page))) - { - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(destBuffer); - } - UnlockReleaseBuffer(destBuffer); + PageSetLSN(destPage, lsn); + PageSetTLI(destPage, ThisTimeLineID); + MarkBufferDirty(destBuffer); } /* restore new inner tuple */ - if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + if (record->xl_info & XLR_BKP_BLOCK(bbi)) + (void) RestoreBackupBlock(lsn, record, bbi, false, false); + else { Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoInner, xldata->initInner); @@ -722,6 +815,15 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) } bbi++; + /* + * Now we can release the leaf-page locks. It's okay to do this before + * updating the parent downlink. + */ + if (BufferIsValid(srcBuffer)) + UnlockReleaseBuffer(srcBuffer); + if (BufferIsValid(destBuffer)) + UnlockReleaseBuffer(destBuffer); + /* update parent downlink, unless we did it above */ if (xldata->blknoParent == InvalidBlockNumber) { @@ -730,7 +832,9 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) } else if (xldata->blknoInner != xldata->blknoParent) { - if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + if (record->xl_info & XLR_BKP_BLOCK(bbi)) + (void) RestoreBackupBlock(lsn, record, bbi, false, false); + else { Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); @@ -788,7 +892,9 @@ spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record) ptr += sizeof(OffsetNumber) * xldata->nChain; chainDest = (OffsetNumber *) ptr; - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (BufferIsValid(buffer)) @@ -857,7 +963,9 @@ spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record) ptr += sizeof(spgxlogVacuumRoot); toDelete = (OffsetNumber *) ptr; - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (BufferIsValid(buffer)) @@ -889,7 +997,20 @@ spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record) ptr += sizeof(spgxlogVacuumRedirect); itemToPlaceholder = (OffsetNumber *) ptr; - if (!(record->xl_info & XLR_BKP_BLOCK_1)) + /* + * If any redirection tuples are being removed, make sure there are no + * live Hot Standby transactions that might need to see them. 
+ */ + if (InHotStandby) + { + if (TransactionIdIsValid(xldata->newestRedirectXid)) + ResolveRecoveryConflictWithSnapshot(xldata->newestRedirectXid, + xldata->node); + } + + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else { buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); @@ -954,36 +1075,6 @@ spg_redo(XLogRecPtr lsn, XLogRecord *record) uint8 info = record->xl_info & ~XLR_INFO_MASK; MemoryContext oldCxt; - /* - * If we have any conflict processing to do, it must happen before we - * update the page. - */ - if (InHotStandby) - { - switch (info) - { - case XLOG_SPGIST_VACUUM_REDIRECT: - { - spgxlogVacuumRedirect *xldata = - (spgxlogVacuumRedirect *) XLogRecGetData(record); - - /* - * If any redirection tuples are being removed, make sure - * there are no live Hot Standby transactions that might - * need to see them. - */ - if (TransactionIdIsValid(xldata->newestRedirectXid)) - ResolveRecoveryConflictWithSnapshot(xldata->newestRedirectXid, - xldata->node); - break; - } - default: - break; - } - } - - RestoreBkpBlocks(lsn, record, false); - oldCxt = MemoryContextSwitchTo(opCtx); switch (info) { @@ -1022,78 +1113,6 @@ spg_redo(XLogRecPtr lsn, XLogRecord *record) MemoryContextReset(opCtx); } -static void -out_target(StringInfo buf, RelFileNode node) -{ - appendStringInfo(buf, "rel %u/%u/%u ", - node.spcNode, node.dbNode, node.relNode); -} - -void -spg_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - switch (info) - { - case XLOG_SPGIST_CREATE_INDEX: - appendStringInfo(buf, "create_index: rel %u/%u/%u", - ((RelFileNode *) rec)->spcNode, - ((RelFileNode *) rec)->dbNode, - ((RelFileNode *) rec)->relNode); - break; - case XLOG_SPGIST_ADD_LEAF: - out_target(buf, ((spgxlogAddLeaf *) rec)->node); - appendStringInfo(buf, "add leaf to page: %u", - ((spgxlogAddLeaf *) rec)->blknoLeaf); - break; - case XLOG_SPGIST_MOVE_LEAFS: - out_target(buf, ((spgxlogMoveLeafs *) rec)->node); - appendStringInfo(buf, "move %u leafs from page %u to page %u", - ((spgxlogMoveLeafs *) rec)->nMoves, - ((spgxlogMoveLeafs *) rec)->blknoSrc, - ((spgxlogMoveLeafs *) rec)->blknoDst); - break; - case XLOG_SPGIST_ADD_NODE: - out_target(buf, ((spgxlogAddNode *) rec)->node); - appendStringInfo(buf, "add node to %u:%u", - ((spgxlogAddNode *) rec)->blkno, - ((spgxlogAddNode *) rec)->offnum); - break; - case XLOG_SPGIST_SPLIT_TUPLE: - out_target(buf, ((spgxlogSplitTuple *) rec)->node); - appendStringInfo(buf, "split node %u:%u to %u:%u", - ((spgxlogSplitTuple *) rec)->blknoPrefix, - ((spgxlogSplitTuple *) rec)->offnumPrefix, - ((spgxlogSplitTuple *) rec)->blknoPostfix, - ((spgxlogSplitTuple *) rec)->offnumPostfix); - break; - case XLOG_SPGIST_PICKSPLIT: - out_target(buf, ((spgxlogPickSplit *) rec)->node); - appendStringInfo(buf, "split leaf page"); - break; - case XLOG_SPGIST_VACUUM_LEAF: - out_target(buf, ((spgxlogVacuumLeaf *) rec)->node); - appendStringInfo(buf, "vacuum leaf tuples on page %u", - ((spgxlogVacuumLeaf *) rec)->blkno); - break; - case XLOG_SPGIST_VACUUM_ROOT: - out_target(buf, ((spgxlogVacuumRoot *) rec)->node); - appendStringInfo(buf, "vacuum leaf tuples on root page %u", - ((spgxlogVacuumRoot *) rec)->blkno); - break; - case XLOG_SPGIST_VACUUM_REDIRECT: - out_target(buf, ((spgxlogVacuumRedirect *) rec)->node); - appendStringInfo(buf, "vacuum redirect tuples on page %u, newest XID %u", - ((spgxlogVacuumRedirect *) rec)->blkno, - ((spgxlogVacuumRedirect *) rec)->newestRedirectXid); - break; - default: - 
appendStringInfo(buf, "unknown spgist op code %u", info); - break; - } -} - void spg_xlog_startup(void) { diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 573c9ad682..548ddbb4dd 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -438,8 +438,9 @@ critical section.) 4. Mark the shared buffer(s) as dirty with MarkBufferDirty(). (This must happen before the WAL record is inserted; see notes in SyncOneBuffer().) -5. Build a WAL log record and pass it to XLogInsert(); then update the page's -LSN and TLI using the returned XLOG location. For instance, +5. If the relation requires WAL-logging, build a WAL log record and pass it +to XLogInsert(); then update the page's LSN and TLI using the returned XLOG +location. For instance, recptr = XLogInsert(rmgr_id, info, rdata); @@ -466,9 +467,9 @@ which buffers were handled that way --- otherwise they may be misled about what the XLOG record actually contains. XLOG records that describe multi-page changes therefore require some care to design: you must be certain that you know what data is indicated by each "BKP" bit. An example of the trickiness -is that in a HEAP_UPDATE record, BKP(1) normally is associated with the source -page and BKP(2) is associated with the destination page --- but if these are -the same page, only BKP(1) would have been set. +is that in a HEAP_UPDATE record, BKP(0) normally is associated with the source +page and BKP(1) is associated with the destination page --- but if these are +the same page, only BKP(0) would have been set. For this reason as well as the risk of deadlocking on buffer locks, it's best to design WAL records so that they reflect small atomic actions involving just @@ -497,12 +498,19 @@ incrementally update the page, the rdata array *must* mention the buffer ID at least once; otherwise there is no defense against torn-page problems. The standard replay-routine pattern for this case is - if (record->xl_info & XLR_BKP_BLOCK_n) - << do nothing, page was rewritten from logged copy >>; + if (record->xl_info & XLR_BKP_BLOCK(N)) + { + /* apply the change from the full-page image */ + (void) RestoreBackupBlock(lsn, record, N, false, false); + return; + } buffer = XLogReadBuffer(rnode, blkno, false); if (!BufferIsValid(buffer)) - << do nothing, page has been deleted >>; + { + /* page has been deleted, so we need do nothing */ + return; + } page = (Page) BufferGetPage(buffer); if (XLByteLE(lsn, PageGetLSN(page))) @@ -520,13 +528,50 @@ The standard replay-routine pattern for this case is UnlockReleaseBuffer(buffer); As noted above, for a multi-page update you need to be able to determine -which XLR_BKP_BLOCK_n flag applies to each page. If a WAL record reflects +which XLR_BKP_BLOCK(N) flag applies to each page. If a WAL record reflects a combination of fully-rewritable and incremental updates, then the rewritable -pages don't count for the XLR_BKP_BLOCK_n numbering. (XLR_BKP_BLOCK_n is -associated with the n'th distinct buffer ID seen in the "rdata" array, and +pages don't count for the XLR_BKP_BLOCK(N) numbering. (XLR_BKP_BLOCK(N) is +associated with the N'th distinct buffer ID seen in the "rdata" array, and per the above discussion, fully-rewritable buffers shouldn't be mentioned in "rdata".) +When replaying a WAL record that describes changes on multiple pages, you +must be careful to lock the pages properly to prevent concurrent Hot Standby +queries from seeing an inconsistent state. 
If this requires that two +or more buffer locks be held concurrently, the coding pattern shown above +is too simplistic, since it assumes the routine can exit as soon as it's +known the current page requires no modification. Instead, you might have +something like + + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + /* apply the change from the full-page image */ + buffer0 = RestoreBackupBlock(lsn, record, 0, false, true); + } + else + { + buffer0 = XLogReadBuffer(rnode, blkno, false); + if (BufferIsValid(buffer0)) + { + ... apply the change if not already done ... + MarkBufferDirty(buffer0); + } + } + + ... similarly apply the changes for remaining pages ... + + /* and now we can release the lock on the first page */ + if (BufferIsValid(buffer0)) + UnlockReleaseBuffer(buffer0); + +Note that we must only use PageSetLSN/PageGetLSN() when we know the action +is serialised. Only Startup process may modify data blocks during recovery, +so Startup process may execute PageGetLSN() without fear of serialisation +problems. All other processes must only call PageSet/GetLSN when holding +either an exclusive buffer lock or a shared lock plus buffer header lock, +or be writing the data block directly rather than through shared buffers +while holding AccessExclusiveLock on the relation. + Due to all these constraints, complex changes (such as a multilevel index insertion) normally need to be described by a series of atomic-action WAL records. What do you do if the intermediate states are not self-consistent? diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 7f2f6921d5..e3fd56dd2b 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -768,26 +768,3 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "clog_redo: unknown op code %u", info); } - -void -clog_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == CLOG_ZEROPAGE) - { - int pageno; - - memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "zeropage: %d", pageno); - } - else if (info == CLOG_TRUNCATE) - { - int pageno; - - memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "truncate before: %d", pageno); - } - else - appendStringInfo(buf, "UNKNOWN"); -} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 8bdf387917..d76105e455 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -2053,36 +2053,3 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "multixact_redo: unknown op code %u", info); } - -void -multixact_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) - { - int pageno; - - memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "zero offsets page: %d", pageno); - } - else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) - { - int pageno; - - memcpy(&pageno, rec, sizeof(int)); - appendStringInfo(buf, "zero members page: %d", pageno); - } - else if (info == XLOG_MULTIXACT_CREATE_ID) - { - xl_multixact_create *xlrec = (xl_multixact_create *) rec; - int i; - - appendStringInfo(buf, "create multixact %u offset %u:", - xlrec->mid, xlrec->moff); - for (i = 0; i < xlrec->nxids; i++) - appendStringInfo(buf, " %u", xlrec->xids[i]); - } - else - appendStringInfo(buf, "UNKNOWN"); -} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index aafd73fbd5..cc210a7e59 100644 --- 
a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -17,7 +17,7 @@ #include "access/spgist.h" #include "access/xact.h" #include "access/xlog_internal.h" -#include "catalog/storage.h" +#include "catalog/storage_xlog.h" #include "commands/dbcommands.h" #include "commands/sequence.h" #include "commands/tablespace.h" diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index dd69c232eb..b8f60d693f 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -531,7 +531,7 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) int i; for (i = 0; i < fdata->num_files; i++) - close(fdata->fd[i]); + CloseTransientFile(fdata->fd[i]); } /* Re-acquire control lock and update page state */ @@ -593,7 +593,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case * where the file doesn't exist, and return zeroes instead. */ - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) { if (errno != ENOENT || !InRecovery) @@ -614,7 +614,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) { slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; - close(fd); + CloseTransientFile(fd); return false; } @@ -623,11 +623,11 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) { slru_errcause = SLRU_READ_FAILED; slru_errno = errno; - close(fd); + CloseTransientFile(fd); return false; } - if (close(fd)) + if (CloseTransientFile(fd)) { slru_errcause = SLRU_CLOSE_FAILED; slru_errno = errno; @@ -740,8 +740,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) * don't use O_EXCL or O_TRUNC or anything like that. 
*/ SlruFileName(ctl, path, segno); - fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY, + S_IRUSR | S_IWUSR); if (fd < 0) { slru_errcause = SLRU_OPEN_FAILED; @@ -773,7 +773,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; if (!fdata) - close(fd); + CloseTransientFile(fd); return false; } @@ -786,7 +786,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) slru_errcause = SLRU_WRITE_FAILED; slru_errno = errno; if (!fdata) - close(fd); + CloseTransientFile(fd); return false; } @@ -800,11 +800,11 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) { slru_errcause = SLRU_FSYNC_FAILED; slru_errno = errno; - close(fd); + CloseTransientFile(fd); return false; } - if (close(fd)) + if (CloseTransientFile(fd)) { slru_errcause = SLRU_CLOSE_FAILED; slru_errno = errno; @@ -1078,7 +1078,7 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint) ok = false; } - if (close(fdata.fd[i])) + if (CloseTransientFile(fdata.fd[i])) { slru_errcause = SLRU_CLOSE_FAILED; slru_errno = errno; diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index f3632c572a..324b6c1860 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -12,10 +12,10 @@ * * Each line in the file represents a timeline switch: * - * <parentTLI> <xlogfname> <reason> + * <parentTLI> <switchpoint> <reason> * * parentTLI ID of the parent timeline - * xlogfname filename of the WAL segment where the switch happened + * switchpoint XLogRecPtr of the WAL position where the switch happened * reason human-readable explanation of why the timeline was changed * * The fields are separated by tabs. Lines beginning with # are comments, and @@ -56,15 +56,23 @@ readTimeLineHistory(TimeLineID targetTLI) char histfname[MAXFNAMELEN]; char fline[MAXPGPATH]; FILE *fd; + TimeLineHistoryEntry *entry; + TimeLineID lasttli = 0; + XLogRecPtr prevend; /* Timeline 1 does not have a history file, so no need to check */ if (targetTLI == 1) - return list_make1_int((int) targetTLI); + { + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); + } if (InArchiveRecovery) { TLHistoryFileName(histfname, targetTLI); - RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); } else TLHistoryFilePath(path, targetTLI); @@ -77,7 +85,10 @@ readTimeLineHistory(TimeLineID targetTLI) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* Not there, so assume no parents */ - return list_make1_int((int) targetTLI); + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); } result = NIL; @@ -85,12 +96,15 @@ readTimeLineHistory(TimeLineID targetTLI) /* * Parse the file... 
*/ + prevend = InvalidXLogRecPtr; while (fgets(fline, sizeof(fline), fd) != NULL) { /* skip leading whitespace and check for # comment */ char *ptr; - char *endptr; TimeLineID tli; + uint32 switchpoint_hi; + uint32 switchpoint_lo; + int nfields; for (ptr = fline; *ptr; ptr++) { @@ -100,38 +114,56 @@ readTimeLineHistory(TimeLineID targetTLI) if (*ptr == '\0' || *ptr == '#') continue; - /* expect a numeric timeline ID as first field of line */ - tli = (TimeLineID) strtoul(ptr, &endptr, 0); - if (endptr == ptr) + nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + + if (nfields < 1) + { + /* expect a numeric timeline ID as first field of line */ ereport(FATAL, (errmsg("syntax error in history file: %s", fline), errhint("Expected a numeric timeline ID."))); + } + if (nfields != 3) + ereport(FATAL, + (errmsg("syntax error in history file: %s", fline), + errhint("Expected an XLOG switchpoint location."))); - if (result && - tli <= (TimeLineID) linitial_int(result)) + if (result && tli <= lasttli) ereport(FATAL, (errmsg("invalid data in history file: %s", fline), errhint("Timeline IDs must be in increasing sequence."))); + lasttli = tli; + + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = tli; + entry->begin = prevend; + entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo; + prevend = entry->end; + /* Build list with newest item first */ - result = lcons_int((int) tli, result); + result = lcons(entry, result); /* we ignore the remainder of each line */ } FreeFile(fd); - if (result && - targetTLI <= (TimeLineID) linitial_int(result)) + if (result && targetTLI <= lasttli) ereport(FATAL, (errmsg("invalid data in history file \"%s\"", path), errhint("Timeline IDs must be less than child timeline's ID."))); - result = lcons_int((int) targetTLI, result); + /* + * Create one more entry for the "tip" of the timeline, which has no + * entry in the history file. 
+ */ + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = prevend; + entry->end = InvalidXLogRecPtr; - ereport(DEBUG3, - (errmsg_internal("history of timeline %u is %s", - targetTLI, nodeToString(result)))); + result = lcons(entry, result); return result; } @@ -153,7 +185,7 @@ existsTimeLineHistory(TimeLineID probeTLI) if (InArchiveRecovery) { TLHistoryFileName(histfname, probeTLI); - RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); } else TLHistoryFilePath(path, probeTLI); @@ -214,7 +246,7 @@ findNewestTimeLine(TimeLineID startTLI) * * newTLI: ID of the new timeline * parentTLI: ID of its immediate parent - * endTLI et al: ID of the last used WAL file, for annotation purposes + * switchpoint: XLOG position where the system switched to the new timeline * reason: human-readable explanation of why the timeline was switched * * Currently this is only used at the end recovery, and so there are no locking @@ -223,12 +255,11 @@ findNewestTimeLine(TimeLineID startTLI) */ void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, - TimeLineID endTLI, XLogSegNo endLogSegNo, char *reason) + XLogRecPtr switchpoint, char *reason) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; char histfname[MAXFNAMELEN]; - char xlogfname[MAXFNAMELEN]; char buffer[BLCKSZ]; int srcfd; int fd; @@ -244,8 +275,8 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, unlink(tmppath); /* do not use get_sync_bit() here --- want to fsync only at end of fill */ - fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -257,12 +288,12 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, if (InArchiveRecovery) { TLHistoryFileName(histfname, parentTLI); - RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); } else TLHistoryFilePath(path, parentTLI); - srcfd = BasicOpenFile(path, O_RDONLY, 0); + srcfd = OpenTransientFile(path, O_RDONLY, 0); if (srcfd < 0) { if (errno != ENOENT) @@ -304,7 +335,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, errmsg("could not write to file \"%s\": %m", tmppath))); } } - close(srcfd); + CloseTransientFile(srcfd); } /* @@ -313,13 +344,11 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, * If we did have a parent file, insert an extra newline just in case the * parent file failed to end with one. */ - XLogFileName(xlogfname, endTLI, endLogSegNo); - snprintf(buffer, sizeof(buffer), - "%s%u\t%s\t%s\n", + "%s%u\t%X/%X\t%s\n", (srcfd < 0) ? 
"" : "\n", parentTLI, - xlogfname, + (uint32) (switchpoint >> 32), (uint32) (switchpoint), reason); nbytes = strlen(buffer); @@ -345,7 +374,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); - if (close(fd)) + if (CloseTransientFile(fd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tmppath))); @@ -380,3 +409,70 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, TLHistoryFileName(histfname, newTLI); XLogArchiveNotify(histfname); } + +/* + * Returns true if 'expectedTLEs' contains a timeline with id 'tli' + */ +bool +tliInHistory(TimeLineID tli, List *expectedTLEs) +{ + ListCell *cell; + + foreach(cell, expectedTLEs) + { + if (((TimeLineHistoryEntry *) lfirst(cell))->tli == tli) + return true; + } + + return false; +} + +/* + * Returns the ID of the timeline in use at a particular point in time, in + * the given timeline history. + */ +TimeLineID +tliOfPointInHistory(XLogRecPtr ptr, List *history) +{ + ListCell *cell; + + foreach(cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + if ((XLogRecPtrIsInvalid(tle->begin) || XLByteLE(tle->begin, ptr)) && + (XLogRecPtrIsInvalid(tle->end) || XLByteLT(ptr, tle->end))) + { + /* found it */ + return tle->tli; + } + } + + /* shouldn't happen. */ + elog(ERROR, "timeline history was not contiguous"); + return 0; /* keep compiler quiet */ +} + +/* + * Returns the point in history where we branched off the given timeline. + * Returns InvalidXLogRecPtr if the timeline is current (= we have not + * branched off from it), and throws an error if the timeline is not part of + * this server's history. + */ +XLogRecPtr +tliSwitchPoint(TimeLineID tli, List *history) +{ + ListCell *cell; + + foreach (cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + + if (tle->tli == tli) + return tle->end; + } + + ereport(ERROR, + (errmsg("requested timeline %u is not in this server's history", + tli))); + return InvalidXLogRecPtr; /* keep compiler quiet */ +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 29a2ee6d39..3a0b190abc 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -318,7 +318,7 @@ MarkAsPreparing(TransactionId xid, const char *gid, proc->lxid = (LocalTransactionId) xid; pgxact->xid = xid; pgxact->xmin = InvalidTransactionId; - pgxact->inCommit = false; + pgxact->delayChkpt = false; pgxact->vacuumFlags = 0; proc->pid = 0; proc->backendId = InvalidBackendId; @@ -970,17 +970,12 @@ EndPrepare(GlobalTransaction gxact) /* * Create the 2PC state file. - * - * Note: because we use BasicOpenFile(), we are responsible for ensuring - * the FD gets closed in any error exit path. Once we get into the - * critical section, though, it doesn't matter since any failure causes - * PANIC anyway. 
*/ TwoPhaseFilePath(path, xid); - fd = BasicOpenFile(path, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(path, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY, + S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -995,7 +990,7 @@ EndPrepare(GlobalTransaction gxact) COMP_CRC32(statefile_crc, record->data, record->len); if ((write(fd, record->data, record->len)) != record->len) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not write two-phase state file: %m"))); @@ -1012,7 +1007,7 @@ EndPrepare(GlobalTransaction gxact) if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32)) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not write two-phase state file: %m"))); @@ -1021,7 +1016,7 @@ EndPrepare(GlobalTransaction gxact) /* Back up to prepare for rewriting the CRC */ if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in two-phase state file: %m"))); @@ -1039,18 +1034,18 @@ EndPrepare(GlobalTransaction gxact) * odds of a PANIC actually occurring should be very tiny given that we * were able to write the bogus CRC above. * - * We have to set inCommit here, too; otherwise a checkpoint starting + * We have to set delayChkpt here, too; otherwise a checkpoint starting * immediately after the WAL record is inserted could complete without * fsync'ing our state file. (This is essentially the same kind of race * condition as the COMMIT-to-clog-write case that RecordTransactionCommit - * uses inCommit for; see notes there.) + * uses delayChkpt for; see notes there.) * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. */ START_CRIT_SECTION(); - MyPgXact->inCommit = true; + MyPgXact->delayChkpt = true; gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE, records.head); @@ -1061,13 +1056,13 @@ EndPrepare(GlobalTransaction gxact) /* write correct CRC and close file */ if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32)) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not write two-phase state file: %m"))); } - if (close(fd) != 0) + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close two-phase state file: %m"))); @@ -1091,7 +1086,7 @@ EndPrepare(GlobalTransaction gxact) * checkpoint starting after this will certainly see the gxact as a * candidate for fsyncing. 
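
The twophase.c hunks here replace BasicOpenFile() with the transient-file API; with BasicOpenFile(), the caller had to close the fd on every error exit, as the removed comment above notes. The usage pattern is roughly the following sketch (assuming the OpenTransientFile()/CloseTransientFile() functions in fd.c that the patch calls, which close any still-open transient fds automatically when a transaction aborts; "path" stands for whatever file the caller is handling):

    int    fd;

    fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));

    /* ... read or write; an ERROR thrown here no longer leaks the fd ... */

    if (CloseTransientFile(fd) != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", path)));
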
*/ - MyPgXact->inCommit = false; + MyPgXact->delayChkpt = false; END_CRIT_SECTION(); @@ -1144,7 +1139,7 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) TwoPhaseFilePath(path, xid); - fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); if (fd < 0) { if (give_warnings) @@ -1163,7 +1158,7 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) */ if (fstat(fd, &stat)) { - close(fd); + CloseTransientFile(fd); if (give_warnings) ereport(WARNING, (errcode_for_file_access(), @@ -1177,14 +1172,14 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) sizeof(pg_crc32)) || stat.st_size > MaxAllocSize) { - close(fd); + CloseTransientFile(fd); return NULL; } crc_offset = stat.st_size - sizeof(pg_crc32); if (crc_offset != MAXALIGN(crc_offset)) { - close(fd); + CloseTransientFile(fd); return NULL; } @@ -1195,7 +1190,7 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) if (read(fd, buf, stat.st_size) != stat.st_size) { - close(fd); + CloseTransientFile(fd); if (give_warnings) ereport(WARNING, (errcode_for_file_access(), @@ -1205,7 +1200,7 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) return NULL; } - close(fd); + CloseTransientFile(fd); hdr = (TwoPhaseFileHeader *) buf; if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size) @@ -1469,9 +1464,9 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) TwoPhaseFilePath(path, xid); - fd = BasicOpenFile(path, - O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(path, + O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY, + S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -1481,14 +1476,14 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) /* Write content and CRC */ if (write(fd, content, len) != len) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not write two-phase state file: %m"))); } if (write(fd, &statefile_crc, sizeof(pg_crc32)) != sizeof(pg_crc32)) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not write two-phase state file: %m"))); @@ -1500,13 +1495,13 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) */ if (pg_fsync(fd) != 0) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync two-phase state file: %m"))); } - if (close(fd) != 0) + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close two-phase state file: %m"))); @@ -1577,7 +1572,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) TwoPhaseFilePath(path, xid); - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0); + fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0); if (fd < 0) { if (errno == ENOENT) @@ -1596,14 +1591,14 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) if (pg_fsync(fd) != 0) { - close(fd); + CloseTransientFile(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync two-phase state file \"%s\": %m", path))); } - if (close(fd) != 0) + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close two-phase state file \"%s\": %m", @@ -1977,7 +1972,7 @@ RecoverPreparedTransactions(void) * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit: in particular, - * we must set the inCommit flag to avoid a race condition. + * we must set the delayChkpt flag to avoid a race condition. 
* * We know the transaction made at least one XLOG entry (its PREPARE), * so it is never possible to optimize out the commit record. @@ -2000,7 +1995,7 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - MyPgXact->inCommit = true; + MyPgXact->delayChkpt = true; /* Emit the XLOG commit record */ xlrec.xid = xid; @@ -2058,7 +2053,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyPgXact->inCommit = false; + MyPgXact->delayChkpt = false; END_CRIT_SECTION(); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index e7a6606ec3..a36c8061a2 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1001,13 +1001,13 @@ RecordTransactionCommit(void) * RecordTransactionAbort. That's because loss of a transaction abort * is noncritical; the presumption would be that it aborted, anyway. * - * It's safe to change the inCommit flag of our own backend without + * It's safe to change the delayChkpt flag of our own backend without * holding the ProcArrayLock, since we're the only one modifying it. - * This makes checkpoint's determination of which xacts are inCommit a + * This makes checkpoint's determination of which xacts are delayChkpt a * bit fuzzy, but it doesn't matter. */ START_CRIT_SECTION(); - MyPgXact->inCommit = true; + MyPgXact->delayChkpt = true; SetCurrentTransactionStopTimestamp(); @@ -1160,7 +1160,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyPgXact->inCommit = false; + MyPgXact->delayChkpt = false; END_CRIT_SECTION(); } @@ -1949,7 +1949,7 @@ CommitTransaction(void) AtEOXact_SPI(true); AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true); - /* smgrcommit already done */ + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(true); @@ -2202,7 +2202,7 @@ PrepareTransaction(void) AtEOXact_SPI(true); AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true); - /* smgrcommit already done */ + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(true); @@ -2348,6 +2348,7 @@ AbortTransaction(void) AtEOXact_SPI(false); AtEOXact_on_commit_actions(false); AtEOXact_Namespace(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_ComboCid(); AtEOXact_HashTables(false); @@ -4608,9 +4609,11 @@ xact_redo_commit_internal(TransactionId xid, XLogRecPtr lsn, /* * Release locks, if any. We do this for both two phase and normal one * phase transactions. In effect we are ignoring the prepare phase and - * just going straight to lock release. + * just going straight to lock release. At commit we release all locks + * via their top-level xid only, so no need to provide subxact list, + * which will save time when replaying commits. 
*/ - StandbyReleaseLockTree(xid, nsubxacts, sub_xids); + StandbyReleaseLockTree(xid, 0, NULL); } /* Make sure files supposed to be dropped are dropped */ @@ -4822,176 +4825,3 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "xact_redo: unknown op code %u", info); } - -static void -xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec) -{ - int i; - TransactionId *subxacts; - - subxacts = (TransactionId *) &xlrec->xnodes[xlrec->nrels]; - - appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); - - if (xlrec->nrels > 0) - { - appendStringInfo(buf, "; rels:"); - for (i = 0; i < xlrec->nrels; i++) - { - char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM); - - appendStringInfo(buf, " %s", path); - pfree(path); - } - } - if (xlrec->nsubxacts > 0) - { - appendStringInfo(buf, "; subxacts:"); - for (i = 0; i < xlrec->nsubxacts; i++) - appendStringInfo(buf, " %u", subxacts[i]); - } - if (xlrec->nmsgs > 0) - { - SharedInvalidationMessage *msgs; - - msgs = (SharedInvalidationMessage *) &subxacts[xlrec->nsubxacts]; - - if (XactCompletionRelcacheInitFileInval(xlrec->xinfo)) - appendStringInfo(buf, "; relcache init file inval dbid %u tsid %u", - xlrec->dbId, xlrec->tsId); - - appendStringInfo(buf, "; inval msgs:"); - for (i = 0; i < xlrec->nmsgs; i++) - { - SharedInvalidationMessage *msg = &msgs[i]; - - if (msg->id >= 0) - appendStringInfo(buf, " catcache %d", msg->id); - else if (msg->id == SHAREDINVALCATALOG_ID) - appendStringInfo(buf, " catalog %u", msg->cat.catId); - else if (msg->id == SHAREDINVALRELCACHE_ID) - appendStringInfo(buf, " relcache %u", msg->rc.relId); - /* remaining cases not expected, but print something anyway */ - else if (msg->id == SHAREDINVALSMGR_ID) - appendStringInfo(buf, " smgr"); - else if (msg->id == SHAREDINVALRELMAP_ID) - appendStringInfo(buf, " relmap"); - else - appendStringInfo(buf, " unknown id %d", msg->id); - } - } -} - -static void -xact_desc_commit_compact(StringInfo buf, xl_xact_commit_compact *xlrec) -{ - int i; - - appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); - - if (xlrec->nsubxacts > 0) - { - appendStringInfo(buf, "; subxacts:"); - for (i = 0; i < xlrec->nsubxacts; i++) - appendStringInfo(buf, " %u", xlrec->subxacts[i]); - } -} - -static void -xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec) -{ - int i; - - appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time)); - if (xlrec->nrels > 0) - { - appendStringInfo(buf, "; rels:"); - for (i = 0; i < xlrec->nrels; i++) - { - char *path = relpathperm(xlrec->xnodes[i], MAIN_FORKNUM); - - appendStringInfo(buf, " %s", path); - pfree(path); - } - } - if (xlrec->nsubxacts > 0) - { - TransactionId *xacts = (TransactionId *) - &xlrec->xnodes[xlrec->nrels]; - - appendStringInfo(buf, "; subxacts:"); - for (i = 0; i < xlrec->nsubxacts; i++) - appendStringInfo(buf, " %u", xacts[i]); - } -} - -static void -xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) -{ - int i; - - appendStringInfo(buf, "subxacts:"); - - for (i = 0; i < xlrec->nsubxacts; i++) - appendStringInfo(buf, " %u", xlrec->xsub[i]); -} - -void -xact_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_XACT_COMMIT_COMPACT) - { - xl_xact_commit_compact *xlrec = (xl_xact_commit_compact *) rec; - - appendStringInfo(buf, "commit: "); - xact_desc_commit_compact(buf, xlrec); - } - else if (info == XLOG_XACT_COMMIT) - { - xl_xact_commit *xlrec = (xl_xact_commit *) rec; - - appendStringInfo(buf, "commit: "); - xact_desc_commit(buf, 
xlrec); - } - else if (info == XLOG_XACT_ABORT) - { - xl_xact_abort *xlrec = (xl_xact_abort *) rec; - - appendStringInfo(buf, "abort: "); - xact_desc_abort(buf, xlrec); - } - else if (info == XLOG_XACT_PREPARE) - { - appendStringInfo(buf, "prepare"); - } - else if (info == XLOG_XACT_COMMIT_PREPARED) - { - xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) rec; - - appendStringInfo(buf, "commit prepared %u: ", xlrec->xid); - xact_desc_commit(buf, &xlrec->crec); - } - else if (info == XLOG_XACT_ABORT_PREPARED) - { - xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) rec; - - appendStringInfo(buf, "abort prepared %u: ", xlrec->xid); - xact_desc_abort(buf, &xlrec->arec); - } - else if (info == XLOG_XACT_ASSIGNMENT) - { - xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; - - /* - * Note that we ignore the WAL record's xid, since we're more - * interested in the top-level xid that issued the record and which - * xids are being reported here. - */ - appendStringInfo(buf, "xid assignment xtop %u: ", xlrec->xtop); - xact_desc_assignment(buf, xlrec); - } - else - appendStringInfo(buf, "UNKNOWN"); -} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d251d08b19..5a97edf50f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -99,16 +99,10 @@ bool XLOG_DEBUG = false; */ #define XLOGfileslop (2*CheckPointSegments + 1) + /* * GUC support */ -const struct config_enum_entry wal_level_options[] = { - {"minimal", WAL_LEVEL_MINIMAL, false}, - {"archive", WAL_LEVEL_ARCHIVE, false}, - {"hot_standby", WAL_LEVEL_HOT_STANDBY, false}, - {NULL, 0, false} -}; - const struct config_enum_entry sync_method_options[] = { {"fsync", SYNC_METHOD_FSYNC, false}, #ifdef HAVE_FSYNC_WRITETHROUGH @@ -232,7 +226,7 @@ static bool recoveryStopAfter; * * recoveryTargetIsLatest: was the requested target timeline 'latest'? * - * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of + * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of * its known parents, newest first (so recoveryTargetTLI is always the * first list member). Only these TLIs are expected to be seen in the WAL * segments we read, and indeed only these TLIs will be considered as @@ -246,7 +240,7 @@ static bool recoveryStopAfter; */ static TimeLineID recoveryTargetTLI; static bool recoveryTargetIsLatest = false; -static List *expectedTLIs; +static List *expectedTLEs; static TimeLineID curFileTLI; /* @@ -453,6 +447,7 @@ typedef struct XLogCtlData /* end+1 of the last record replayed (or being replayed) */ XLogRecPtr replayEndRecPtr; + TimeLineID replayEndTLI; /* end+1 of the last record replayed */ XLogRecPtr recoveryLastRecPtr; /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ @@ -512,12 +507,18 @@ static XLogwrtResult LogwrtResult = {0, 0}; /* * Codes indicating where we got a WAL file from during recovery, or where - * to attempt to get one. These are chosen so that they can be OR'd together - * in a bitmask state variable. + * to attempt to get one. 
*/ -#define XLOG_FROM_ARCHIVE (1<<0) /* Restored using restore_command */ -#define XLOG_FROM_PG_XLOG (1<<1) /* Existing file in pg_xlog */ -#define XLOG_FROM_STREAM (1<<2) /* Streamed from master */ +typedef enum +{ + XLOG_FROM_ANY = 0, /* request to read WAL from any source */ + XLOG_FROM_ARCHIVE, /* restored using restore_command */ + XLOG_FROM_PG_XLOG, /* existing file in pg_xlog */ + XLOG_FROM_STREAM, /* streamed from master */ +} XLogSource; + +/* human-readable names for XLogSources, for debugging output */ +static const char *xlogSourceNames[] = { "any", "archive", "pg_xlog", "stream" }; /* * openLogFile is -1 or a kernel FD for an open log file segment. @@ -542,22 +543,28 @@ static XLogSegNo readSegNo = 0; static uint32 readOff = 0; static uint32 readLen = 0; static bool readFileHeaderValidated = false; -static int readSource = 0; /* XLOG_FROM_* code */ +static XLogSource readSource = 0; /* XLOG_FROM_* code */ /* - * Keeps track of which sources we've tried to read the current WAL - * record from and failed. + * Keeps track of which source we're currently reading from. This is + * different from readSource in that this is always set, even when we don't + * currently have a WAL file open. If lastSourceFailed is set, our last + * attempt to read from currentSource failed, and we should try another source + * next. */ -static int failedSources = 0; /* OR of XLOG_FROM_* codes */ +static XLogSource currentSource = 0; /* XLOG_FROM_* code */ +static bool lastSourceFailed = false; /* * These variables track when we last obtained some WAL data to process, * and where we got it from. (XLogReceiptSource is initially the same as * readSource, but readSource gets reset to zero when we don't have data - * to process right now.) + * to process right now. It is also different from currentSource, which + * also changes when we try to read from a source and fail, while + * XLogReceiptSource tracks where we last successfully read some WAL.) */ static TimestampTz XLogReceiptTime = 0; -static int XLogReceiptSource = 0; /* XLOG_FROM_* code */ +static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */ /* Buffer for currently read page (XLOG_BLCKSZ bytes) */ static char *readBuf = NULL; @@ -570,9 +577,11 @@ static uint32 readRecordBufSize = 0; static XLogRecPtr ReadRecPtr; /* start of last record read */ static XLogRecPtr EndRecPtr; /* end+1 of last record read */ static TimeLineID lastPageTLI = 0; +static TimeLineID lastSegmentTLI = 0; static XLogRecPtr minRecoveryPoint; /* local copy of * ControlFile->minRecoveryPoint */ +static TimeLineID minRecoveryPointTLI; static bool updateMinRecoveryPoint = true; /* @@ -587,25 +596,6 @@ static bool InRedo = false; /* Have we launched bgwriter during recovery? */ static bool bgwriterLaunched = false; -/* - * Information logged when we detect a change in one of the parameters - * important for Hot Standby. 
- */ -typedef struct xl_parameter_change -{ - int MaxConnections; - int max_prepared_xacts; - int max_locks_per_xact; - int wal_level; -} xl_parameter_change; - -/* logs restore point */ -typedef struct xl_restore_point -{ - TimestampTz rp_time; - char rp_name[MAXFNAMELEN]; -} xl_restore_point; - static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo); @@ -629,7 +619,7 @@ static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, bool use_lock); static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notexistOk); -static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int sources); +static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source); static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt, bool randAccess); static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, @@ -644,7 +634,7 @@ static void CleanupBackupHistory(void); static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt); static void CheckRecoveryConsistency(void); -static bool ValidXLogPageHeader(XLogPageHeader hdr, int emode); +static bool ValidXLogPageHeader(XLogPageHeader hdr, int emode, bool segmentonly); static bool ValidXLogRecordHeader(XLogRecPtr *RecPtr, XLogRecord *record, int emode, bool randAccess); static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt); @@ -835,8 +825,8 @@ begin:; * At the exit of this loop, write_len includes the backup block data. * * Also set the appropriate info bits to show which buffers were backed - * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct - * buffer value (ignoring InvalidBuffer) appearing in the rdata chain. + * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer + * value (ignoring InvalidBuffer) appearing in the rdata chain. */ rdt_lastnormal = rdt; write_len = len; @@ -848,7 +838,7 @@ begin:; if (!dtbuf_bkp[i]) continue; - info |= XLR_SET_BKP_BLOCK(i); + info |= XLR_BKP_BLOCK(i); bkpb = &(dtbuf_xlg[i]); page = (char *) BufferGetBlock(dtbuf[i]); @@ -1224,7 +1214,8 @@ XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites, /* * XXX We assume page LSN is first data on *every* page that can be passed * to XLogInsert, whether it otherwise has the standard page layout or - * not. + * not. We don't need the buffer header lock for PageGetLSN because we + * have exclusive lock on the page and/or the relation. 
*/ *lsn = PageGetLSN(page); @@ -1789,6 +1780,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) /* update local copy */ minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; /* * An invalid minRecoveryPoint means that we need to recover all the WAL, @@ -1802,6 +1794,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; XLogRecPtr newMinRecoveryPoint; + TimeLineID newMinRecoveryPointTLI; /* * To avoid having to update the control file too often, we update it @@ -1818,6 +1811,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) */ SpinLockAcquire(&xlogctl->info_lck); newMinRecoveryPoint = xlogctl->replayEndRecPtr; + newMinRecoveryPointTLI = xlogctl->replayEndTLI; SpinLockRelease(&xlogctl->info_lck); if (!force && XLByteLT(newMinRecoveryPoint, lsn)) @@ -1831,13 +1825,16 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint)) { ControlFile->minRecoveryPoint = newMinRecoveryPoint; + ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI; UpdateControlFile(); minRecoveryPoint = newMinRecoveryPoint; + minRecoveryPointTLI = newMinRecoveryPointTLI; ereport(DEBUG2, - (errmsg("updated min recovery point to %X/%X", + (errmsg("updated min recovery point to %X/%X on timeline %u", (uint32) (minRecoveryPoint >> 32), - (uint32) minRecoveryPoint))); + (uint32) minRecoveryPoint, + newMinRecoveryPointTLI))); } } LWLockRelease(ControlFileLock); @@ -2143,6 +2140,7 @@ XLogNeedsFlush(XLogRecPtr record) if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) return true; minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; LWLockRelease(ControlFileLock); /* @@ -2245,6 +2243,16 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) unlink(tmppath); + /* + * Allocate a buffer full of zeros. This is done before opening the file + * so that we don't leak the file descriptor if palloc fails. + * + * Note: palloc zbuffer, instead of just using a local char array, to + * ensure it is reasonably well-aligned; this may save a few cycles + * transferring data to the kernel. + */ + zbuffer = (char *) palloc0(XLOG_BLCKSZ); + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); @@ -2261,12 +2269,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) * fsync below) that all the indirect blocks are down on disk. Therefore, * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the * log file. - * - * Note: palloc zbuffer, instead of just using a local char array, to - * ensure it is reasonably well-aligned; this may save a few cycles - * transferring data to the kernel. */ - zbuffer = (char *) palloc0(XLOG_BLCKSZ); for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ) { errno = 0; @@ -2278,6 +2281,9 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) * If we fail to make the file, delete it to release disk space */ unlink(tmppath); + + close(fd); + /* if write didn't set errno, assume problem is no disk space */ errno = save_errno ? 
save_errno : ENOSPC; @@ -2289,9 +2295,12 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) pfree(zbuffer); if (pg_fsync(fd) != 0) + { + close(fd); ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); + } if (close(fd)) ereport(ERROR, @@ -2362,7 +2371,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno) * Open the source file */ XLogFilePath(path, srcTLI, srcsegno); - srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); + srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -2376,8 +2385,8 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno) unlink(tmppath); /* do not use get_sync_bit() here --- want to fsync only at end of fill */ - fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -2422,12 +2431,12 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno) (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath))); - if (close(fd)) + if (CloseTransientFile(fd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tmppath))); - close(srcfd); + CloseTransientFile(srcfd); /* * Now move the segment into place with its final name. @@ -2506,7 +2515,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, /* * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so + * overwrite an existing file. However, there shouldn't be one, so * rename() is an acceptable substitute except for the truly paranoid. */ #if HAVE_WORKING_LINK @@ -2564,7 +2573,7 @@ XLogFileOpen(XLogSegNo segno) /* * Open a logfile segment for reading (during recovery). * - * If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive. + * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. * Otherwise, it's assumed to be already available in pg_xlog. */ static int @@ -2588,7 +2597,8 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, restoredFromArchive = RestoreArchivedFile(path, xlogfname, "RECOVERYXLOG", - XLogSegSize); + XLogSegSize, + InRedo); if (!restoredFromArchive) return -1; break; @@ -2706,10 +2716,10 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, /* * Open a logfile segment for reading (during recovery). * - * This version searches for the segment with any TLI listed in expectedTLIs. + * This version searches for the segment with any TLI listed in expectedTLEs. */ static int -XLogFileReadAnyTLI(XLogSegNo segno, int emode, int sources) +XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source) { char path[MAXPGPATH]; ListCell *cell; @@ -2717,7 +2727,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int sources) /* * Loop looking for a suitable timeline ID: we might need to read any of - * the timelines listed in expectedTLIs. + * the timelines listed in expectedTLEs. * * We expect curFileTLI on entry to be the TLI of the preceding file in * sequence, or 0 if there was no predecessor. We do not allow curFileTLI @@ -2725,14 +2735,14 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int sources) * parent timeline extends to higher segment numbers than the child we * want to read. 
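
Concretely, when any source is allowed, the loop below tries each acceptable timeline from the archive first and then from pg_xlog, newest timeline first, and stops as soon as it reaches a timeline older than curFileTLI. A standalone sketch of just that ordering (the history and the curFileTLI value are made up):

    #include <stdio.h>

    int
    main(void)
    {
        /* illustrative history, newest first, and the TLI of the previous file */
        unsigned int expectedTLIs[] = {3, 2, 1};
        unsigned int curFileTLI = 2;
        const char  *sources[] = {"archive", "pg_xlog"};
        int          i, s;

        for (i = 0; i < 3; i++)
        {
            if (expectedTLIs[i] < curFileTLI)
                break;          /* don't bother looking at too-old TLIs */
            for (s = 0; s < 2; s++)
                printf("try timeline %u from %s\n", expectedTLIs[i], sources[s]);
        }
        return 0;
    }
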
*/ - foreach(cell, expectedTLIs) + foreach(cell, expectedTLEs) { - TimeLineID tli = (TimeLineID) lfirst_int(cell); + TimeLineID tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli; if (tli < curFileTLI) break; /* don't bother looking at too-old TLIs */ - if (sources & XLOG_FROM_ARCHIVE) + if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) { fd = XLogFileRead(segno, emode, tli, XLOG_FROM_ARCHIVE, true); if (fd != -1) @@ -2742,7 +2752,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int sources) } } - if (sources & XLOG_FROM_PG_XLOG) + if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG) { fd = XLogFileRead(segno, emode, tli, XLOG_FROM_PG_XLOG, true); if (fd != -1) @@ -3080,9 +3090,16 @@ CleanupBackupHistory(void) } /* - * Restore the backup blocks present in an XLOG record, if any. + * Restore a full-page image from a backup block attached to an XLOG record. + * + * lsn: LSN of the XLOG record being replayed + * record: the complete XLOG record + * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1) + * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock + * keep_buffer: TRUE to return the buffer still locked and pinned * - * We assume all of the record has been read into memory at *record. + * Returns the buffer number containing the page. Note this is not terribly + * useful unless keep_buffer is specified as TRUE. * * Note: when a backup block is available in XLOG, we restore it * unconditionally, even if the page in the database appears newer. @@ -3093,15 +3110,20 @@ CleanupBackupHistory(void) * modifications of the page that appear in XLOG, rather than possibly * ignoring them as already applied, but that's not a huge drawback. * - * If 'cleanup' is true, a cleanup lock is used when restoring blocks. - * Otherwise, a normal exclusive lock is used. During crash recovery, that's - * just pro forma because there can't be any regular backends in the system, - * but in hot standby mode the distinction is important. The 'cleanup' - * argument applies to all backup blocks in the WAL record, that suffices for - * now. + * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer, + * else a normal exclusive lock is used. During crash recovery, that's just + * pro forma because there can't be any regular backends in the system, but + * in hot standby mode the distinction is important. + * + * If 'keep_buffer' is true, return without releasing the buffer lock and pin; + * then caller is responsible for doing UnlockReleaseBuffer() later. This + * is needed in some cases when replaying XLOG records that touch multiple + * pages, to prevent inconsistent states from being visible to other backends. + * (Again, that's only important in hot standby mode.) 
*/ -void -RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup) +Buffer +RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index, + bool get_cleanup_lock, bool keep_buffer) { Buffer buffer; Page page; @@ -3109,49 +3131,59 @@ RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup) char *blk; int i; - if (!(record->xl_info & XLR_BKP_BLOCK_MASK)) - return; - + /* Locate requested BkpBlock in the record */ blk = (char *) XLogRecGetData(record) + record->xl_len; for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) { - if (!(record->xl_info & XLR_SET_BKP_BLOCK(i))) + if (!(record->xl_info & XLR_BKP_BLOCK(i))) continue; memcpy(&bkpb, blk, sizeof(BkpBlock)); blk += sizeof(BkpBlock); - buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block, - RBM_ZERO); - Assert(BufferIsValid(buffer)); - if (cleanup) - LockBufferForCleanup(buffer); - else - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + if (i == block_index) + { + /* Found it, apply the update */ + buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block, + RBM_ZERO); + Assert(BufferIsValid(buffer)); + if (get_cleanup_lock) + LockBufferForCleanup(buffer); + else + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = (Page) BufferGetPage(buffer); + page = (Page) BufferGetPage(buffer); - if (bkpb.hole_length == 0) - { - memcpy((char *) page, blk, BLCKSZ); - } - else - { - memcpy((char *) page, blk, bkpb.hole_offset); - /* must zero-fill the hole */ - MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length); - memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length), - blk + bkpb.hole_offset, - BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); - } + if (bkpb.hole_length == 0) + { + memcpy((char *) page, blk, BLCKSZ); + } + else + { + memcpy((char *) page, blk, bkpb.hole_offset); + /* must zero-fill the hole */ + MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length); + memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length), + blk + bkpb.hole_offset, + BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); + } - PageSetLSN(page, lsn); - PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + + if (!keep_buffer) + UnlockReleaseBuffer(buffer); + + return buffer; + } blk += BLCKSZ - bkpb.hole_length; } + + /* Caller specified a bogus block_index */ + elog(ERROR, "failed to restore block_index %d", block_index); + return InvalidBuffer; /* keep compiler quiet */ } /* @@ -3193,7 +3225,7 @@ RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode) { uint32 blen; - if (!(record->xl_info & XLR_SET_BKP_BLOCK(i))) + if (!(record->xl_info & XLR_BKP_BLOCK(i))) continue; if (remaining < sizeof(BkpBlock)) @@ -3312,16 +3344,17 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt) /* * Since we are going to a random position in WAL, forget any prior * state about what timeline we were in, and allow it to be any - * timeline in expectedTLIs. We also set a flag to allow curFileTLI + * timeline in expectedTLEs. We also set a flag to allow curFileTLI * to go backwards (but we can't reset that variable right here, since * we might not change files at all). */ - lastPageTLI = 0; /* see comment in ValidXLogPageHeader */ + /* see comment in ValidXLogPageHeader */ + lastPageTLI = lastSegmentTLI = 0; randAccess = true; /* allow curFileTLI to go backwards too */ } /* This is the first try to read this page. 
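
The hole handling in the RestoreBackupBlock() hunk above relies on the backup-block image omitting the unused portion of the page: restoring it means copying the bytes before the hole, zero-filling the hole, and copying the remainder. A standalone sketch of that reconstruction (BLCKSZ and the hole geometry are illustrative):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define BLCKSZ 8192

    /* rebuild a full page from an image that omits the hole */
    static void
    restore_page(char *page, const char *image,
                 uint32_t hole_offset, uint32_t hole_length)
    {
        if (hole_length == 0)
            memcpy(page, image, BLCKSZ);
        else
        {
            memcpy(page, image, hole_offset);
            memset(page + hole_offset, 0, hole_length);
            memcpy(page + hole_offset + hole_length,
                   image + hole_offset,
                   BLCKSZ - (hole_offset + hole_length));
        }
    }

    int
    main(void)
    {
        static char image[BLCKSZ];   /* holds BLCKSZ - hole_length useful bytes */
        static char page[BLCKSZ];

        memset(image, 0xAA, sizeof(image));  /* pretend this came from WAL */
        restore_page(page, image, 64, 1024); /* hole at offset 64, 1024 bytes long */

        printf("byte before hole: %02x, first byte of hole: %02x\n",
               (unsigned char) page[63], (unsigned char) page[64]);
        return 0;
    }
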
*/ - failedSources = 0; + lastSourceFailed = false; retry: /* Read the page containing the record */ if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess)) @@ -3534,7 +3567,7 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt) return record; next_record_is_invalid: - failedSources |= readSource; + lastSourceFailed = true; if (readFile >= 0) { @@ -3556,7 +3589,7 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt) * ReadRecord. It's not intended for use from anywhere else. */ static bool -ValidXLogPageHeader(XLogPageHeader hdr, int emode) +ValidXLogPageHeader(XLogPageHeader hdr, int emode, bool segmentonly) { XLogRecPtr recaddr; @@ -3642,7 +3675,7 @@ ValidXLogPageHeader(XLogPageHeader hdr, int emode) /* * Check page TLI is one of the expected values. */ - if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli)) + if (!tliInHistory(hdr->xlp_tli, expectedTLEs)) { ereport(emode_for_corrupt_record(emode, recaddr), (errmsg("unexpected timeline ID %u in log segment %s, offset %u", @@ -3658,19 +3691,32 @@ ValidXLogPageHeader(XLogPageHeader hdr, int emode) * successive pages of a consistent WAL sequence. * * Of course this check should only be applied when advancing sequentially - * across pages; therefore ReadRecord resets lastPageTLI to zero when - * going to a random page. + * across pages; therefore ReadRecord resets lastPageTLI and + * lastSegmentTLI to zero when going to a random page. + * + * Sometimes we re-open a segment that's already been partially replayed. + * In that case we cannot perform the normal TLI check: if there is a + * timeline switch within the segment, the first page has a smaller TLI + * than later pages following the timeline switch, and we might've read + * them already. As a weaker test, we still check that it's not smaller + * than the TLI we last saw at the beginning of a segment. Pass + * segmentonly = true when re-validating the first page like that, and the + * page you're actually interested in comes later. */ - if (hdr->xlp_tli < lastPageTLI) + if (hdr->xlp_tli < (segmentonly ? lastSegmentTLI : lastPageTLI)) { ereport(emode_for_corrupt_record(emode, recaddr), (errmsg("out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u", - hdr->xlp_tli, lastPageTLI, + hdr->xlp_tli, + segmentonly ? lastSegmentTLI : lastPageTLI, XLogFileNameP(curFileTLI, readSegNo), readOff))); return false; } lastPageTLI = hdr->xlp_tli; + if (readOff == 0) + lastSegmentTLI = hdr->xlp_tli; + return true; } @@ -3766,57 +3812,86 @@ ValidXLogRecordHeader(XLogRecPtr *RecPtr, XLogRecord *record, int emode, static bool rescanLatestTimeLine(void) { + List *newExpectedTLEs; + bool found; + ListCell *cell; TimeLineID newtarget; + TimeLineHistoryEntry *currentTle = NULL; + /* use volatile pointer to prevent code rearrangement */ + volatile XLogCtlData *xlogctl = XLogCtl; newtarget = findNewestTimeLine(recoveryTargetTLI); - if (newtarget != recoveryTargetTLI) + if (newtarget == recoveryTargetTLI) { - /* - * Determine the list of expected TLIs for the new TLI - */ - List *newExpectedTLIs; - - newExpectedTLIs = readTimeLineHistory(newtarget); + /* No new timelines found */ + return false; + } - /* - * If the current timeline is not part of the history of the new - * timeline, we cannot proceed to it. - * - * XXX This isn't foolproof: The new timeline might have forked from - * the current one, but before the current recovery location. 
In that - * case we will still switch to the new timeline and proceed replaying - * from it even though the history doesn't match what we already - * replayed. That's not good. We will likely notice at the next online - * checkpoint, as the TLI won't match what we expected, but it's not - * guaranteed. The admin needs to make sure that doesn't happen. - */ - if (!list_member_int(newExpectedTLIs, - (int) recoveryTargetTLI)) - ereport(LOG, - (errmsg("new timeline %u is not a child of database system timeline %u", - newtarget, - ThisTimeLineID))); - else - { - /* use volatile pointer to prevent code rearrangement */ - volatile XLogCtlData *xlogctl = XLogCtl; + /* + * Determine the list of expected TLIs for the new TLI + */ - /* Switch target */ - recoveryTargetTLI = newtarget; - list_free(expectedTLIs); - expectedTLIs = newExpectedTLIs; + newExpectedTLEs = readTimeLineHistory(newtarget); - SpinLockAcquire(&xlogctl->info_lck); - xlogctl->RecoveryTargetTLI = recoveryTargetTLI; - SpinLockRelease(&xlogctl->info_lck); + /* + * If the current timeline is not part of the history of the new + * timeline, we cannot proceed to it. + */ + found = false; + foreach (cell, newExpectedTLEs) + { + currentTle = (TimeLineHistoryEntry *) lfirst(cell); - ereport(LOG, - (errmsg("new target timeline is %u", - recoveryTargetTLI))); - return true; + if (currentTle->tli == recoveryTargetTLI) + { + found = true; + break; } } - return false; + if (!found) + { + ereport(LOG, + (errmsg("new timeline %u is not a child of database system timeline %u", + newtarget, + ThisTimeLineID))); + return false; + } + + /* + * The current timeline was found in the history file, but check that the + * next timeline was forked off from it *after* the current recovery + * location. + */ + if (XLByteLT(currentTle->end, EndRecPtr)) + { + ereport(LOG, + (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", + newtarget, + ThisTimeLineID, + (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr))); + return false; + } + + /* The new timeline history seems valid. Switch target */ + recoveryTargetTLI = newtarget; + list_free_deep(expectedTLEs); + expectedTLEs = newExpectedTLEs; + + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->RecoveryTargetTLI = recoveryTargetTLI; + SpinLockRelease(&xlogctl->info_lck); + + ereport(LOG, + (errmsg("new target timeline is %u", + recoveryTargetTLI))); + + /* + * Wake up any walsenders to notice that we have a new target timeline. + */ + if (AllowCascadeReplication()) + WalSndWakeup(); + + return true; } /* @@ -5254,19 +5329,47 @@ StartupXLOG(void) readRecoveryCommandFile(); /* Now we can determine the list of expected TLIs */ - expectedTLIs = readTimeLineHistory(recoveryTargetTLI); + expectedTLEs = readTimeLineHistory(recoveryTargetTLI); + + /* + * If the location of the checkpoint record is not on the expected + * timeline in the history of the requested timeline, we cannot proceed: + * the backup is not part of the history of the requested timeline. + */ + if (tliOfPointInHistory(ControlFile->checkPoint, expectedTLEs) != + ControlFile->checkPointCopy.ThisTimeLineID) + { + XLogRecPtr switchpoint; + + /* + * tliSwitchPoint will throw an error if the checkpoint's timeline + * is not in expectedTLEs at all. 
+ */ + switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs); + ereport(FATAL, + (errmsg("requested timeline %u is not a child of this server's history", + recoveryTargetTLI), + errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X", + (uint32) (ControlFile->checkPoint >> 32), + (uint32) ControlFile->checkPoint, + ControlFile->checkPointCopy.ThisTimeLineID, + (uint32) (switchpoint >> 32), + (uint32) switchpoint))); + } /* - * If pg_control's timeline is not in expectedTLIs, then we cannot - * proceed: the backup is not part of the history of the requested - * timeline. + * The min recovery point should be part of the requested timeline's + * history, too. */ - if (!list_member_int(expectedTLIs, - (int) ControlFile->checkPointCopy.ThisTimeLineID)) + if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && + tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != + ControlFile->minRecoveryPointTLI) ereport(FATAL, - (errmsg("requested timeline %u is not a child of database system timeline %u", + (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", recoveryTargetTLI, - ControlFile->checkPointCopy.ThisTimeLineID))); + (uint32) (ControlFile->minRecoveryPoint >> 32), + (uint32) ControlFile->minRecoveryPoint, + ControlFile->minRecoveryPointTLI))); /* * Save the selected recovery target timeline ID and @@ -5486,7 +5589,10 @@ StartupXLOG(void) { /* initialize minRecoveryPoint if not set yet */ if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo)) + { ControlFile->minRecoveryPoint = checkPoint.redo; + ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID; + } } /* @@ -5519,6 +5625,7 @@ StartupXLOG(void) /* initialize our local copy of minRecoveryPoint */ minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; /* * Reset pgstat data, because it may be invalid after recovery. @@ -5607,6 +5714,7 @@ StartupXLOG(void) * subxids are listed with their parent prepared transactions. */ running.xcnt = nxids; + running.subxcnt = 0; running.subxid_overflow = false; running.nextXid = checkPoint.nextXid; running.oldestRunningXid = oldestActiveXID; @@ -5643,6 +5751,7 @@ StartupXLOG(void) */ SpinLockAcquire(&xlogctl->info_lck); xlogctl->replayEndRecPtr = ReadRecPtr; + xlogctl->replayEndTLI = ThisTimeLineID; xlogctl->recoveryLastRecPtr = EndRecPtr; xlogctl->recoveryLastXTime = 0; xlogctl->currentChunkStartTime = 0; @@ -5698,7 +5807,7 @@ StartupXLOG(void) bool recoveryContinue = true; bool recoveryApply = true; bool recoveryPause = false; - ErrorContextCallback errcontext; + ErrorContextCallback errcallback; TimestampTz xtime; InRedo = true; @@ -5760,10 +5869,10 @@ StartupXLOG(void) } /* Setup error traceback support for ereport() */ - errcontext.callback = rm_redo_error_callback; - errcontext.arg = (void *) record; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; + errcallback.callback = rm_redo_error_callback; + errcallback.arg = (void *) record; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; /* * ShmemVariableCache->nextXid must be beyond record's xid. 
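
The new StartupXLOG() checks above reduce to one question: on which timeline of the requested history does a given WAL position fall, where each history entry covers the half-open range [begin, end)? A standalone sketch of that lookup and of the comparison against pg_control, with made-up values:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t XLogRecPtr;

    typedef struct
    {
        uint32_t    tli;
        XLogRecPtr  begin;   /* inclusive; 0 means "from the very beginning" */
        XLogRecPtr  end;     /* exclusive; 0 means "still current" */
    } HistoryEntry;

    /* return the timeline a point belongs to, or 0 if it is not covered */
    static uint32_t
    tli_of_point(XLogRecPtr ptr, const HistoryEntry *history, int n)
    {
        int i;

        for (i = 0; i < n; i++)
        {
            const HistoryEntry *e = &history[i];

            if ((e->begin == 0 || e->begin <= ptr) &&
                (e->end == 0 || ptr < e->end))
                return e->tli;
        }
        return 0;
    }

    int
    main(void)
    {
        /* made-up history, newest first, as readTimeLineHistory() builds it */
        HistoryEntry history[] = {
            {3, 0x29A2D128, 0},
            {2, 0x16000000, 0x29A2D128},
            {1, 0, 0x16000000},
        };
        XLogRecPtr   checkpoint = 0x20000000;  /* illustrative pg_control value */
        uint32_t     expected_tli = 2;         /* illustrative checkPointCopy TLI */
        uint32_t     actual = tli_of_point(checkpoint, history, 3);

        if (actual != expected_tli)
            printf("FATAL: checkpoint is on timeline %u, not %u\n",
                   actual, expected_tli);
        else
            printf("checkpoint %X/%X is on timeline %u, as expected\n",
                   (unsigned int) (checkpoint >> 32), (unsigned int) checkpoint,
                   actual);
        return 0;
    }
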
@@ -5808,9 +5917,9 @@ StartupXLOG(void) RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record); /* Pop the error context stack */ - error_context_stack = errcontext.previous; + error_context_stack = errcallback.previous; - if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) && + if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) && XLByteLE(ControlFile->backupEndPoint, EndRecPtr)) { /* @@ -5961,8 +6070,8 @@ StartupXLOG(void) (errmsg("selected new timeline ID: %u", ThisTimeLineID))); /* - * Write comment to history file to explain why and where timeline - * changed. Comment varies according to the recovery target used. + * Create a comment for the history file to explain why and where + * timeline changed. */ if (recoveryTarget == RECOVERY_TARGET_XID) snprintf(reason, sizeof(reason), @@ -5982,7 +6091,7 @@ StartupXLOG(void) snprintf(reason, sizeof(reason), "no recovery target specified"); writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI, - curFileTLI, endLogSegNo, reason); + EndRecPtr, reason); } /* Save the selected TimeLineID in shared memory, too */ @@ -6823,6 +6932,18 @@ LogCheckpointEnd(bool restartpoint) * Note: flags contains other bits, of interest here only for logging purposes. * In particular note that this routine is synchronous and does not pay * attention to CHECKPOINT_WAIT. + * + * If !shutdown then we are writing an online checkpoint. This is a very special + * kind of operation and WAL record because the checkpoint action occurs over + * a period of time yet logically occurs at just a single LSN. The logical + * position of the WAL record (redo ptr) is the same or earlier than the + * physical position. When we replay WAL we locate the checkpoint via its + * physical position then read the redo ptr and actually start replay at the + * earlier logical position. Note that we don't write *anything* to WAL at + * the logical position, so that location could be any other kind of WAL record. + * All of this mechanism allows us to continue working while we checkpoint. + * As a result, timing of actions is critical here and be careful to note that + * this function will likely take minutes to execute on a busy system. */ void CreateCheckPoint(int flags) @@ -6834,8 +6955,8 @@ CreateCheckPoint(int flags) XLogRecData rdata; uint32 freespace; XLogSegNo _logSegNo; - TransactionId *inCommitXids; - int nInCommit; + VirtualTransactionId *vxids; + int nvxids; /* * An end-of-recovery checkpoint is really a shutdown checkpoint, just @@ -7006,9 +7127,14 @@ CreateCheckPoint(int flags) TRACE_POSTGRESQL_CHECKPOINT_START(flags); /* - * Before flushing data, we must wait for any transactions that are - * currently in their commit critical sections. If an xact inserted its - * commit record into XLOG just before the REDO point, then a crash + * In some cases there are groups of actions that must all occur on + * one side or the other of a checkpoint record. Before flushing the + * checkpoint record we must explicitly wait for any backend currently + * performing those groups of actions. + * + * One example is end of transaction, so we must wait for any transactions + * that are currently in commit critical sections. If an xact inserted + * its commit record into XLOG just before the REDO point, then a crash * restart from the REDO point would not replay that record, which means * that our flushing had better include the xact's update of pg_clog. So * we wait till he's out of his commit critical section before proceeding. 
@@ -7023,21 +7149,24 @@ CreateCheckPoint(int flags) * protected by different locks, but again that seems best on grounds of * minimizing lock contention.) * - * A transaction that has not yet set inCommit when we look cannot be at + * A transaction that has not yet set delayChkpt when we look cannot be at * risk, since he's not inserted his commit record yet; and one that's * already cleared it is not at risk either, since he's done fixing clog * and we will correctly flush the update below. So we cannot miss any * xacts we need to wait for. */ - nInCommit = GetTransactionsInCommit(&inCommitXids); - if (nInCommit > 0) + vxids = GetVirtualXIDsDelayingChkpt(&nvxids); + if (nvxids > 0) { + uint32 nwaits = 0; + do { pg_usleep(10000L); /* wait for 10 msec */ - } while (HaveTransactionsInCommit(inCommitXids, nInCommit)); + nwaits++; + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids)); } - pfree(inCommitXids); + pfree(vxids); /* * Get the other info we need for the checkpoint record. @@ -7082,11 +7211,9 @@ CreateCheckPoint(int flags) * * If we are shutting down, or Startup process is completing crash * recovery we don't need to write running xact data. - * - * Update checkPoint.nextXid since we have a later value */ if (!shutdown && XLogStandbyInfoActive()) - LogStandbySnapshot(&checkPoint.nextXid); + LogStandbySnapshot(); START_CRIT_SECTION(); @@ -7146,6 +7273,7 @@ CreateCheckPoint(int flags) ControlFile->time = (pg_time_t) time(NULL); /* crash recovery should always recover to the end of WAL */ MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr)); + ControlFile->minRecoveryPointTLI = 0; UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -7791,6 +7919,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) * with their parent prepared transactions. */ running.xcnt = nxids; + running.subxcnt = 0; running.subxid_overflow = false; running.nextXid = checkPoint.nextXid; running.oldestRunningXid = oldestActiveXID; @@ -7821,16 +7950,41 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) } /* - * TLI may change in a shutdown checkpoint, but it shouldn't decrease + * TLI may change in a shutdown checkpoint. */ if (checkPoint.ThisTimeLineID != ThisTimeLineID) { + /* + * The new timeline better be in the list of timelines we expect + * to see, according to the timeline history. It should also not + * decrease. + */ if (checkPoint.ThisTimeLineID < ThisTimeLineID || - !list_member_int(expectedTLIs, - (int) checkPoint.ThisTimeLineID)) + !tliInHistory(checkPoint.ThisTimeLineID, expectedTLEs)) ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", checkPoint.ThisTimeLineID, ThisTimeLineID))); + + /* + * If we have not yet reached min recovery point, and we're about + * to switch to a timeline greater than the timeline of the min + * recovery point: trouble. After switching to the new timeline, + * we could not possibly visit the min recovery point on the + * correct timeline anymore. This can happen if there is a newer + * timeline in the archive that branched before the timeline the + * min recovery point is on, and you attempt to do PITR to the + * new timeline. 
+ */ + if (!XLogRecPtrIsInvalid(minRecoveryPoint) && + XLByteLT(lsn, minRecoveryPoint) && + checkPoint.ThisTimeLineID > minRecoveryPointTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", + checkPoint.ThisTimeLineID, + (uint32) (minRecoveryPoint >> 32), + (uint32) minRecoveryPoint, + minRecoveryPointTLI))); + /* Following WAL records should be run with new TLI */ ThisTimeLineID = checkPoint.ThisTimeLineID; } @@ -7915,7 +8069,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); if (XLByteLT(ControlFile->minRecoveryPoint, lsn)) + { ControlFile->minRecoveryPoint = lsn; + ControlFile->minRecoveryPointTLI = ThisTimeLineID; + } MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr)); ControlFile->backupEndRequired = false; UpdateControlFile(); @@ -7945,9 +8102,11 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) * decreasing max_* settings. */ minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; if (minRecoveryPoint != 0 && XLByteLT(minRecoveryPoint, lsn)) { ControlFile->minRecoveryPoint = lsn; + ControlFile->minRecoveryPointTLI = ThisTimeLineID; } UpdateControlFile(); @@ -7982,97 +8141,6 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) } } -void -xlog_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_CHECKPOINT_SHUTDOWN || - info == XLOG_CHECKPOINT_ONLINE) - { - CheckPoint *checkpoint = (CheckPoint *) rec; - - appendStringInfo(buf, "checkpoint: redo %X/%X; " - "tli %u; fpw %s; xid %u/%u; oid %u; multi %u; offset %u; " - "oldest xid %u in DB %u; oldest running xid %u; %s", - (uint32) (checkpoint->redo >> 32), (uint32) checkpoint->redo, - checkpoint->ThisTimeLineID, - checkpoint->fullPageWrites ? "true" : "false", - checkpoint->nextXidEpoch, checkpoint->nextXid, - checkpoint->nextOid, - checkpoint->nextMulti, - checkpoint->nextMultiOffset, - checkpoint->oldestXid, - checkpoint->oldestXidDB, - checkpoint->oldestActiveXid, - (info == XLOG_CHECKPOINT_SHUTDOWN) ? 
"shutdown" : "online"); - } - else if (info == XLOG_NOOP) - { - appendStringInfo(buf, "xlog no-op"); - } - else if (info == XLOG_NEXTOID) - { - Oid nextOid; - - memcpy(&nextOid, rec, sizeof(Oid)); - appendStringInfo(buf, "nextOid: %u", nextOid); - } - else if (info == XLOG_SWITCH) - { - appendStringInfo(buf, "xlog switch"); - } - else if (info == XLOG_RESTORE_POINT) - { - xl_restore_point *xlrec = (xl_restore_point *) rec; - - appendStringInfo(buf, "restore point: %s", xlrec->rp_name); - - } - else if (info == XLOG_BACKUP_END) - { - XLogRecPtr startpoint; - - memcpy(&startpoint, rec, sizeof(XLogRecPtr)); - appendStringInfo(buf, "backup end: %X/%X", - (uint32) (startpoint >> 32), (uint32) startpoint); - } - else if (info == XLOG_PARAMETER_CHANGE) - { - xl_parameter_change xlrec; - const char *wal_level_str; - const struct config_enum_entry *entry; - - memcpy(&xlrec, rec, sizeof(xl_parameter_change)); - - /* Find a string representation for wal_level */ - wal_level_str = "?"; - for (entry = wal_level_options; entry->name; entry++) - { - if (entry->val == xlrec.wal_level) - { - wal_level_str = entry->name; - break; - } - } - - appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s", - xlrec.MaxConnections, - xlrec.max_prepared_xacts, - xlrec.max_locks_per_xact, - wal_level_str); - } - else if (info == XLOG_FPW_CHANGE) - { - bool fpw; - - memcpy(&fpw, rec, sizeof(bool)); - appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false"); - } - else - appendStringInfo(buf, "UNKNOWN"); -} - #ifdef WAL_DEBUG static void @@ -8081,7 +8149,8 @@ xlog_outrec(StringInfo buf, XLogRecord *record) int i; appendStringInfo(buf, "prev %X/%X; xid %u", - (uint32) (record->xl_prev >> 32), (uint32) record->xl_prev, + (uint32) (record->xl_prev >> 32), + (uint32) record->xl_prev, record->xl_xid); appendStringInfo(buf, "; len %u", @@ -8089,8 +8158,8 @@ xlog_outrec(StringInfo buf, XLogRecord *record) for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) { - if (record->xl_info & XLR_SET_BKP_BLOCK(i)) - appendStringInfo(buf, "; bkpb%d", i + 1); + if (record->xl_info & XLR_BKP_BLOCK(i)) + appendStringInfo(buf, "; bkpb%d", i); } appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name); @@ -8175,7 +8244,7 @@ assign_xlog_sync_method(int new_sync_method, void *extra) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync log segment %s: %m", - XLogFileNameP(curFileTLI, readSegNo)))); + XLogFileNameP(ThisTimeLineID, openLogSegNo)))); if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) XLogFileClose(); } @@ -8199,7 +8268,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync log file %s: %m", - XLogFileNameP(ThisTimeLineID, openLogSegNo)))); + XLogFileNameP(ThisTimeLineID, segno)))); break; #ifdef HAVE_FSYNC_WRITETHROUGH case SYNC_METHOD_FSYNC_WRITETHROUGH: @@ -8207,7 +8276,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync write-through log file %s: %m", - XLogFileNameP(ThisTimeLineID, openLogSegNo)))); + XLogFileNameP(ThisTimeLineID, segno)))); break; #endif #ifdef HAVE_FDATASYNC @@ -8216,7 +8285,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fdatasync log file %s: %m", - XLogFileNameP(ThisTimeLineID, openLogSegNo)))); + XLogFileNameP(ThisTimeLineID, segno)))); break; #endif case SYNC_METHOD_OPEN: @@ -9028,33 +9097,16 @@ GetXLogWriteRecPtr(void) } /* - * 
Returns the redo pointer of the last restartpoint. This is the oldest - * point in WAL that we still need, if we have to restart recovery. Returns - * InvalidXLogRecPtr if we don't reliably know that point yet, that is, - * before we have started WAL redo. - * - * This function only works in the startup process, and only while we are - * in WAL redo. It's important to not return a value before redo has started, - * to avoid deleting WAL files that we might still need, but there's no - * fundamental reason why this couldn't return a valid value after redo has - * finished, or in other processes. This is enough for the current usage, - * however. + * Returns the redo pointer of the last checkpoint or restartpoint. This is + * the oldest point in WAL that we still need, if we have to restart recovery. */ void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) { - if (InRedo) - { - LWLockAcquire(ControlFileLock, LW_SHARED); - *oldrecptr = ControlFile->checkPointCopy.redo; - *oldtli = ControlFile->checkPointCopy.ThisTimeLineID; - LWLockRelease(ControlFileLock); - } - else - { - *oldrecptr = InvalidXLogRecPtr; - *oldtli = 0; - } + LWLockAcquire(ControlFileLock, LW_SHARED); + *oldrecptr = ControlFile->checkPointCopy.redo; + *oldtli = ControlFile->checkPointCopy.ThisTimeLineID; + LWLockRelease(ControlFileLock); } /* @@ -9233,7 +9285,7 @@ CancelBackup(void) * In standby mode, if after a successful return of XLogPageRead() the * caller finds the record it's interested in to be broken, it should * ereport the error with the level determined by - * emode_for_corrupt_record(), and then set "failedSources |= readSource" + * emode_for_corrupt_record(), and then set lastSourceFailed * and call XLogPageRead() again with the same arguments. This lets * XLogPageRead() to try fetching the record from another source, or to * sleep and retry. @@ -9251,7 +9303,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt, targetRecOff = (*RecPtr) % XLOG_BLCKSZ; /* Fast exit if we have read the record in the current buffer already */ - if (failedSources == 0 && targetSegNo == readSegNo && + if (!lastSourceFailed && targetSegNo == readSegNo && targetPageOff == readOff && targetRecOff < readLen) return true; @@ -9298,17 +9350,18 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt, /* In archive or crash recovery. */ if (readFile < 0) { - int sources; + int source; /* Reset curFileTLI if random fetch. 
*/ if (randAccess) curFileTLI = 0; - sources = XLOG_FROM_PG_XLOG; if (InArchiveRecovery) - sources |= XLOG_FROM_ARCHIVE; + source = XLOG_FROM_ANY; + else + source = XLOG_FROM_PG_XLOG; - readFile = XLogFileReadAnyTLI(readSegNo, emode, sources); + readFile = XLogFileReadAnyTLI(readSegNo, emode, source); if (readFile < 0) return false; } @@ -9359,7 +9412,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt, fname, readOff))); goto next_record_is_invalid; } - if (!ValidXLogPageHeader((XLogPageHeader) readBuf, emode)) + if (!ValidXLogPageHeader((XLogPageHeader) readBuf, emode, true)) goto next_record_is_invalid; } @@ -9385,7 +9438,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt, fname, readOff))); goto next_record_is_invalid; } - if (!ValidXLogPageHeader((XLogPageHeader) readBuf, emode)) + if (!ValidXLogPageHeader((XLogPageHeader) readBuf, emode, false)) goto next_record_is_invalid; readFileHeaderValidated = true; @@ -9397,7 +9450,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt, return true; next_record_is_invalid: - failedSources |= readSource; + lastSourceFailed = true; if (readFile >= 0) close(readFile); @@ -9437,185 +9490,289 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt) { static pg_time_t last_fail_time = 0; + pg_time_t now; + + /*------- + * Standby mode is implemented by a state machine: + * + * 1. Read from archive (XLOG_FROM_ARCHIVE) + * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG) + * 3. Check trigger file + * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM) + * 5. Rescan timelines + * 6. Sleep 5 seconds, and loop back to 1. + * + * Failure to read from the current source advances the state machine to + * the next state. In addition, successfully reading a file from pg_xlog + * moves the state machine from state 2 back to state 1 (we always prefer + * files in the archive over files in pg_xlog). + * + * 'currentSource' indicates the current state. There are no currentSource + * values for "check trigger", "rescan timelines", and "sleep" states, + * those actions are taken when reading from the previous source fails, as + * part of advancing to the next state. + *------- + */ + if (currentSource == 0) + currentSource = XLOG_FROM_ARCHIVE; for (;;) { - if (WalRcvInProgress()) + int oldSource = currentSource; + + /* + * First check if we failed to read from the current source, and + * advance the state machine if so. The failure to read might've + * happened outside this function, e.g when a CRC check fails on a + * record, or within this loop. + */ + if (lastSourceFailed) { - bool havedata; - /* - * If we find an invalid record in the WAL streamed from master, - * something is seriously wrong. There's little chance that the - * problem will just go away, but PANIC is not good for - * availability either, especially in hot standby mode. - * Disconnect, and retry from archive/pg_xlog again. The WAL in - * the archive should be identical to what was streamed, so it's - * unlikely that it helps, but one can hope... - */ - if (failedSources & XLOG_FROM_STREAM) + switch (currentSource) { - ShutdownWalRcv(); - continue; - } + case XLOG_FROM_ARCHIVE: + currentSource = XLOG_FROM_PG_XLOG; + break; - /* - * Walreceiver is active, so see if new data has arrived. - * - * We only advance XLogReceiptTime when we obtain fresh WAL from - * walreceiver and observe that we had already processed - * everything before the most recent "chunk" that it flushed to - * disk. 
In steady state where we are keeping up with the - * incoming data, XLogReceiptTime will be updated on each cycle. - * When we are behind, XLogReceiptTime will not advance, so the - * grace time allotted to conflicting queries will decrease. - */ - if (XLByteLT(RecPtr, receivedUpto)) - havedata = true; - else - { - XLogRecPtr latestChunkStart; + case XLOG_FROM_PG_XLOG: + /* + * Check to see if the trigger file exists. Note that we do + * this only after failure, so when you create the trigger + * file, we still finish replaying as much as we can from + * archive and pg_xlog before failover. + */ + if (CheckForStandbyTrigger()) + return false; - receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart); - if (XLByteLT(RecPtr, receivedUpto)) - { - havedata = true; - if (!XLByteLT(RecPtr, latestChunkStart)) + /* + * If primary_conninfo is set, launch walreceiver to try to + * stream the missing WAL. + * + * If fetching_ckpt is TRUE, RecPtr points to the initial + * checkpoint location. In that case, we use RedoStartLSN + * as the streaming start position instead of RecPtr, so + * that when we later jump backwards to start redo at + * RedoStartLSN, we will have the logs streamed already. + */ + if (PrimaryConnInfo) { - XLogReceiptTime = GetCurrentTimestamp(); - SetCurrentChunkStartTime(XLogReceiptTime); + XLogRecPtr ptr = fetching_ckpt ? RedoStartLSN : RecPtr; + + RequestXLogStreaming(ptr, PrimaryConnInfo); } - } - else - havedata = false; - } - if (havedata) - { - /* - * Great, streamed far enough. Open the file if it's not open - * already. Use XLOG_FROM_STREAM so that source info is set - * correctly and XLogReceiptTime isn't changed. - */ - if (readFile < 0) - { - readFile = XLogFileRead(readSegNo, PANIC, - recoveryTargetTLI, - XLOG_FROM_STREAM, false); - Assert(readFile >= 0); - } - else - { - /* just make sure source info is correct... */ - readSource = XLOG_FROM_STREAM; - XLogReceiptSource = XLOG_FROM_STREAM; - } - break; - } + /* + * Move to XLOG_FROM_STREAM state in either case. We'll get + * immediate failure if we didn't launch walreceiver, and + * move on to the next state. + */ + currentSource = XLOG_FROM_STREAM; + break; - /* - * Data not here yet, so check for trigger then sleep for five - * seconds like in the WAL file polling case below. - */ - if (CheckForStandbyTrigger()) - return false; + case XLOG_FROM_STREAM: + /* + * Failure while streaming. Most likely, we got here because + * streaming replication was terminated, or promotion was + * triggered. But we also get here if we find an invalid + * record in the WAL streamed from master, in which case + * something is seriously wrong. There's little chance that + * the problem will just go away, but PANIC is not good for + * availability either, especially in hot standby mode. So, + * we treat that the same as disconnection, and retry from + * archive/pg_xlog again. The WAL in the archive should be + * identical to what was streamed, so it's unlikely that it + * helps, but one can hope... + */ + /* + * Before we leave XLOG_FROM_STREAM state, make sure that + * walreceiver is not running, so that it won't overwrite + * any WAL that we restore from archive. + */ + if (WalRcvInProgress()) + ShutdownWalRcv(); - /* - * Wait for more WAL to arrive, or timeout to be reached - */ - WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT, - 5000L); - ResetLatch(&XLogCtl->recoveryWakeupLatch); + /* + * Before we sleep, re-scan for possible new timelines if + * we were requested to recover to the latest timeline. 
+ */ + if (recoveryTargetIsLatest) + { + if (rescanLatestTimeLine()) + { + currentSource = XLOG_FROM_ARCHIVE; + break; + } + } + + /* + * XLOG_FROM_STREAM is the last state in our state machine, + * so we've exhausted all the options for obtaining the + * requested WAL. We're going to loop back and retry from + * the archive, but if it hasn't been long since last + * attempt, sleep 5 seconds to avoid busy-waiting. + */ + now = (pg_time_t) time(NULL); + if ((now - last_fail_time) < 5) + { + pg_usleep(1000000L * (5 - (now - last_fail_time))); + now = (pg_time_t) time(NULL); + } + last_fail_time = now; + currentSource = XLOG_FROM_ARCHIVE; + break; + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } } - else + else if (currentSource == XLOG_FROM_PG_XLOG) { /* - * WAL receiver is not active. Poll the archive. + * We just successfully read a file in pg_xlog. We prefer files + * in the archive over ones in pg_xlog, so try the next file + * again from the archive first. */ - int sources; - pg_time_t now; + currentSource = XLOG_FROM_ARCHIVE; + } - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } - /* Reset curFileTLI if random fetch. */ - if (randAccess) - curFileTLI = 0; + if (currentSource != oldSource) + elog(DEBUG2, "switched WAL source from %s to %s after %s", + xlogSourceNames[oldSource], xlogSourceNames[currentSource], + lastSourceFailed ? "failure" : "success"); + + /* + * We've now handled possible failure. Try to read from the chosen + * source. + */ + lastSourceFailed = false; + + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_XLOG: + /* Close any old file we might have open. */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + /* Reset curFileTLI if random fetch. */ + if (randAccess) + curFileTLI = 0; - /* - * Try to restore the file from archive, or read an existing file - * from pg_xlog. - */ - sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG; - if (!(sources & ~failedSources)) - { /* - * We've exhausted all options for retrieving the file. Retry. + * Try to restore the file from archive, or read an existing + * file from pg_xlog. */ - failedSources = 0; + readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource); + if (readFile >= 0) + return true; /* success! */ /* - * Before we sleep, re-scan for possible new timelines if we - * were requested to recover to the latest timeline. + * Nope, not found in archive or pg_xlog. */ - if (recoveryTargetIsLatest) - { - if (rescanLatestTimeLine()) - continue; - } + lastSourceFailed = true; + break; + + case XLOG_FROM_STREAM: + { + bool havedata; /* - * If it hasn't been long since last attempt, sleep to avoid - * busy-waiting. + * Check if WAL receiver is still active. */ - now = (pg_time_t) time(NULL); - if ((now - last_fail_time) < 5) + if (!WalRcvInProgress()) { - pg_usleep(1000000L * (5 - (now - last_fail_time))); - now = (pg_time_t) time(NULL); + lastSourceFailed = true; + break; } - last_fail_time = now; /* - * If primary_conninfo is set, launch walreceiver to try to - * stream the missing WAL, before retrying to restore from - * archive/pg_xlog. + * Walreceiver is active, so see if new data has arrived. * - * If fetching_ckpt is TRUE, RecPtr points to the initial - * checkpoint location. In that case, we use RedoStartLSN as - * the streaming start position instead of RecPtr, so that - * when we later jump backwards to start redo at RedoStartLSN, - * we will have the logs streamed already. 
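
(Editorial aside, not part of the patch.) Stripped of the side effects -- the trigger-file check, starting and stopping walreceiver, the timeline rescan and the 5-second sleep -- the source-selection rules of the new state machine in WaitForWALToBecomeAvailable() reduce to two transitions: advance archive -> pg_xlog -> stream when a read fails, loop from stream back to the archive, and prefer the archive again after a successful read from pg_xlog. The enum and function names below are invented for this standalone sketch; only the transitions are taken from the code above.

#include <stdio.h>

typedef enum
{
	SRC_ARCHIVE,				/* stands in for XLOG_FROM_ARCHIVE */
	SRC_PG_XLOG,				/* stands in for XLOG_FROM_PG_XLOG */
	SRC_STREAM					/* stands in for XLOG_FROM_STREAM */
} WalSource;

static const char *src_names[] = {"archive", "pg_xlog", "stream"};

/* Where to look next after failing to read from 'cur'. */
static WalSource
next_after_failure(WalSource cur)
{
	switch (cur)
	{
		case SRC_ARCHIVE:
			return SRC_PG_XLOG;	/* not in the archive: try pg_xlog */
		case SRC_PG_XLOG:
			return SRC_STREAM;	/* not in pg_xlog: try streaming */
		case SRC_STREAM:
		default:
			return SRC_ARCHIVE;	/* stream broke: sleep, then start over */
	}
}

/* Where to look next after a successful read from 'cur'. */
static WalSource
next_after_success(WalSource cur)
{
	/* Files in the archive are always preferred over files in pg_xlog. */
	return (cur == SRC_PG_XLOG) ? SRC_ARCHIVE : cur;
}

int
main(void)
{
	WalSource	src = SRC_ARCHIVE;
	int			i;

	/* A standby that keeps failing to find the next segment... */
	for (i = 0; i < 4; i++)
	{
		printf("trying %s\n", src_names[src]);
		src = next_after_failure(src);
	}
	/* ...and one that just read a segment from pg_xlog. */
	printf("after pg_xlog success: try %s next\n",
		   src_names[next_after_success(SRC_PG_XLOG)]);
	return 0;
}
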
+ * We only advance XLogReceiptTime when we obtain fresh WAL + * from walreceiver and observe that we had already processed + * everything before the most recent "chunk" that it flushed to + * disk. In steady state where we are keeping up with the + * incoming data, XLogReceiptTime will be updated on each cycle. + * When we are behind, XLogReceiptTime will not advance, so the + * grace time allotted to conflicting queries will decrease. */ - if (PrimaryConnInfo) + if (XLByteLT(RecPtr, receivedUpto)) + havedata = true; + else { - XLogRecPtr ptr = fetching_ckpt ? RedoStartLSN : RecPtr; + XLogRecPtr latestChunkStart; - RequestXLogStreaming(ptr, PrimaryConnInfo); - continue; + receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart); + if (XLByteLT(RecPtr, receivedUpto)) + { + havedata = true; + if (!XLByteLT(RecPtr, latestChunkStart)) + { + XLogReceiptTime = GetCurrentTimestamp(); + SetCurrentChunkStartTime(XLogReceiptTime); + } + } + else + havedata = false; + } + if (havedata) + { + /* + * Great, streamed far enough. Open the file if it's not + * open already. Use XLOG_FROM_STREAM so that source info + * is set correctly and XLogReceiptTime isn't changed. + */ + if (readFile < 0) + { + readFile = XLogFileRead(readSegNo, PANIC, + recoveryTargetTLI, + XLOG_FROM_STREAM, false); + Assert(readFile >= 0); + } + else + { + /* just make sure source info is correct... */ + readSource = XLOG_FROM_STREAM; + XLogReceiptSource = XLOG_FROM_STREAM; + return true; + } + break; } - } - /* Don't try to read from a source that just failed */ - sources &= ~failedSources; - readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, sources); - if (readFile >= 0) - break; - /* - * Nope, not found in archive and/or pg_xlog. - */ - failedSources |= sources; + /* + * Data not here yet. Check for trigger, then wait for + * walreceiver to wake us up when new WAL arrives. + */ + if (CheckForStandbyTrigger()) + { + /* + * Note that we don't "return false" immediately here. + * After being triggered, we still want to replay all the + * WAL that was already streamed. It's in pg_xlog now, so + * we just treat this as a failure, and the state machine + * will move on to replay the streamed WAL from pg_xlog, + * and then recheck the trigger and exit replay. + */ + lastSourceFailed = true; + break; + } - /* - * Check to see if the trigger file exists. Note that we do this - * only after failure, so when you create the trigger file, we - * still finish replaying as much as we can from archive and - * pg_xlog before failover. - */ - if (CheckForStandbyTrigger()) - return false; + /* + * Wait for more WAL to arrive. Time out after 5 seconds, like + * when polling the archive, to react to a trigger file + * promptly. + */ + WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT, + 5000L); + ResetLatch(&XLogCtl->recoveryWakeupLatch); + break; + } + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); } /* @@ -9625,7 +9782,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, HandleStartupProcInterrupts(); } - return true; + return false; /* not reached */ } /* diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index 93aca7375c..9bd6b8e3a2 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -41,10 +41,15 @@ * For fixed-size files, the caller may pass the expected size as an * additional crosscheck on successful recovery. If the file size is not * known, set expectedSize = 0. 
+ * + * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments + * in the archive. This is used when fetching the initial checkpoint record, + * when we are not yet sure how far back we need the WAL. */ bool RestoreArchivedFile(char *path, const char *xlogfname, - const char *recovername, off_t expectedSize) + const char *recovername, off_t expectedSize, + bool cleanupEnabled) { char xlogpath[MAXPGPATH]; char xlogRestoreCmd[MAXPGPATH]; @@ -113,9 +118,10 @@ RestoreArchivedFile(char *path, const char *xlogfname, * replication. All files earlier than this point can be deleted from the * archive, though there is no requirement to do so. * - * We initialise this with the filename of an InvalidXLogRecPtr, which - * will prevent the deletion of any WAL files from the archive because of - * the alphabetic sorting property of WAL filenames. + * If cleanup is not enabled, initialise this with the filename of + * InvalidXLogRecPtr, which will prevent the deletion of any WAL files + * from the archive because of the alphabetic sorting property of WAL + * filenames. * * Once we have successfully located the redo pointer of the checkpoint * from which we start recovery we never request a file prior to the redo @@ -124,9 +130,9 @@ RestoreArchivedFile(char *path, const char *xlogfname, * flags to signify the point when we can begin deleting WAL files from * the archive. */ - GetOldestRestartPoint(&restartRedoPtr, &restartTli); - if (!XLogRecPtrIsInvalid(restartRedoPtr)) + if (cleanupEnabled) { + GetOldestRestartPoint(&restartRedoPtr, &restartTli); XLByteToSeg(restartRedoPtr, restartSegNo); XLogFileName(lastRestartPointFname, restartTli, restartSegNo); /* we shouldn't need anything earlier than last restart point */ @@ -342,7 +348,6 @@ ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal) GetOldestRestartPoint(&restartRedoPtr, &restartTli); XLByteToSeg(restartRedoPtr, restartSegNo); XLogFileName(lastRestartPointFname, restartTli, restartSegNo); - LWLockRelease(ControlFileLock); /* * construct the command to be executed diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index ec634f1660..ec7786a2ae 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -247,7 +247,8 @@ Boot_CreateStmt: ONCOMMIT_NOOP, (Datum) 0, false, - true); + true, + false); elog(DEBUG4, "relation created with OID %u", id); } do_end(); diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 87d6f02421..0b61c97eb4 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -355,12 +355,7 @@ performMultipleDeletions(const ObjectAddresses *objects, /* And clean up */ free_object_addresses(targetObjects); - /* - * We closed depRel earlier in deleteOneObject if doing a drop - * concurrently - */ - if ((flags & PERFORM_DELETION_CONCURRENTLY) != PERFORM_DELETION_CONCURRENTLY) - heap_close(depRel, RowExclusiveLock); + heap_close(depRel, RowExclusiveLock); } /* @@ -1012,8 +1007,33 @@ deleteOneObject(const ObjectAddress *object, Relation depRel, int flags) } /* - * First remove any pg_depend records that link from this object to - * others. (Any records linking to this object should be gone already.) + * Close depRel if we are doing a drop concurrently. The object deletion + * subroutine will commit the current transaction, so we can't keep the + * relation open across doDeletion(). 
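
(Editorial aside, not part of the patch.) The "alphabetic sorting property of WAL filenames" that RestoreArchivedFile() relies on for its cleanup cutoff above is simply that the timeline, log and segment fields are fixed-width uppercase hex, so lexicographic order matches WAL order and a name derived from InvalidXLogRecPtr sorts before every real segment. A standalone demonstration; wal_file_name() is an invented helper mirroring the fixed-width hex layout used by XLogFileName():

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Fixed-width hex fields: TLI, log id, segment within log. */
static void
wal_file_name(char *buf, uint32_t tli, uint64_t segno, uint64_t segs_per_id)
{
	sprintf(buf, "%08X%08X%08X", tli,
			(unsigned int) (segno / segs_per_id),
			(unsigned int) (segno % segs_per_id));
}

int
main(void)
{
	uint64_t	segs_per_id = UINT64_C(0x100000000) / (16 * 1024 * 1024);
	char		cutoff[64],
				older[64],
				newer[64];

	wal_file_name(cutoff, 0, 0, segs_per_id);	/* name for "invalid" LSN */
	wal_file_name(older, 1, 10, segs_per_id);
	wal_file_name(newer, 1, 11, segs_per_id);

	/* Lexicographic comparison agrees with WAL order... */
	printf("%s < %s: %s\n", older, newer,
		   strcmp(older, newer) < 0 ? "yes" : "no");
	/* ...so the all-zeros cutoff protects every segment from cleanup. */
	printf("%s < %s: %s\n", cutoff, older,
		   strcmp(cutoff, older) < 0 ? "yes" : "no");
	return 0;
}
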
+ */ + if (flags & PERFORM_DELETION_CONCURRENTLY) + heap_close(depRel, RowExclusiveLock); + + /* + * Delete the object itself, in an object-type-dependent way. + * + * We used to do this after removing the outgoing dependency links, but it + * seems just as reasonable to do it beforehand. In the concurrent case + * we *must* do it in this order, because we can't make any transactional + * updates before calling doDeletion() --- they'd get committed right + * away, which is not cool if the deletion then fails. + */ + doDeletion(object, flags); + + /* + * Reopen depRel if we closed it above + */ + if (flags & PERFORM_DELETION_CONCURRENTLY) + depRel = heap_open(DependRelationId, RowExclusiveLock); + + /* + * Now remove any pg_depend records that link from this object to others. + * (Any records linking to this object should be gone already.) * * When dropping a whole object (subId = 0), remove all pg_depend records * for its sub-objects too. @@ -1054,17 +1074,6 @@ deleteOneObject(const ObjectAddress *object, Relation depRel, int flags) deleteSharedDependencyRecordsFor(object->classId, object->objectId, object->objectSubId); - /* - * Close depRel if we are doing a drop concurrently because it commits the - * transaction, so we don't want dangling references. - */ - if ((flags & PERFORM_DELETION_CONCURRENTLY) == PERFORM_DELETION_CONCURRENTLY) - heap_close(depRel, RowExclusiveLock); - - /* - * Now delete the object itself, in an object-type-dependent way. - */ - doDeletion(object, flags); /* * Delete any comments or security labels associated with this object. @@ -1247,15 +1256,23 @@ AcquireDeletionLock(const ObjectAddress *object, int flags) { if (object->classId == RelationRelationId) { - if ((flags & PERFORM_DELETION_CONCURRENTLY) == PERFORM_DELETION_CONCURRENTLY) + /* + * In DROP INDEX CONCURRENTLY, take only ShareUpdateExclusiveLock on + * the index for the moment. index_drop() will promote the lock once + * it's safe to do so. In all other cases we need full exclusive + * lock. + */ + if (flags & PERFORM_DELETION_CONCURRENTLY) LockRelationOid(object->objectId, ShareUpdateExclusiveLock); else LockRelationOid(object->objectId, AccessExclusiveLock); } else + { /* assume we should lock the whole object not a sub-object */ LockDatabaseObject(object->classId, object->objectId, 0, AccessExclusiveLock); + } } /* diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index c80df418fa..d93d273eb1 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -49,6 +49,7 @@ #include "catalog/pg_type.h" #include "catalog/pg_type_fn.h" #include "catalog/storage.h" +#include "catalog/storage_xlog.h" #include "commands/tablecmds.h" #include "commands/typecmds.h" #include "miscadmin.h" @@ -985,7 +986,8 @@ heap_create_with_catalog(const char *relname, OnCommitAction oncommit, Datum reloptions, bool use_user_acl, - bool allow_system_table_mods) + bool allow_system_table_mods, + bool is_internal) { Relation pg_class_desc; Relation new_rel_desc; @@ -1275,8 +1277,15 @@ heap_create_with_catalog(const char *relname, } /* Post creation hook for new relation */ - InvokeObjectAccessHook(OAT_POST_CREATE, - RelationRelationId, relid, 0, NULL); + if (object_access_hook) + { + ObjectAccessPostCreate post_create_args; + + memset(&post_create_args, 0, sizeof(ObjectAccessPostCreate)); + post_create_args.is_internal = is_internal; + (*object_access_hook)(OAT_POST_CREATE, RelationRelationId, + relid, 0, &post_create_args); + } /* * Store any supplied constraints and defaults. 
@@ -1426,6 +1435,47 @@ DeleteAttributeTuples(Oid relid) heap_close(attrel, RowExclusiveLock); } +/* + * DeleteSystemAttributeTuples + * + * Remove pg_attribute rows for system columns of the given relid. + * + * Note: this is only used when converting a table to a view. Views don't + * have system columns, so we should remove them from pg_attribute. + */ +void +DeleteSystemAttributeTuples(Oid relid) +{ + Relation attrel; + SysScanDesc scan; + ScanKeyData key[2]; + HeapTuple atttup; + + /* Grab an appropriate lock on the pg_attribute relation */ + attrel = heap_open(AttributeRelationId, RowExclusiveLock); + + /* Use the index to scan only system attributes of the target relation */ + ScanKeyInit(&key[0], + Anum_pg_attribute_attrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + ScanKeyInit(&key[1], + Anum_pg_attribute_attnum, + BTLessEqualStrategyNumber, F_INT2LE, + Int16GetDatum(0)); + + scan = systable_beginscan(attrel, AttributeRelidNumIndexId, true, + SnapshotNow, 2, key); + + /* Delete all the matching tuples */ + while ((atttup = systable_getnext(scan)) != NULL) + simple_heap_delete(attrel, &atttup->t_self); + + /* Clean up after the scan */ + systable_endscan(scan); + heap_close(attrel, RowExclusiveLock); +} + /* * RemoveAttributeById * diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 464950b9af..92e8de1531 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -33,6 +33,7 @@ #include "catalog/dependency.h" #include "catalog/heap.h" #include "catalog/index.h" +#include "catalog/objectaccess.h" #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" #include "catalog/pg_operator.h" @@ -124,6 +125,10 @@ static void ResetReindexPending(void); * See whether an existing relation has a primary key. * * Caller must have suitable lock on the relation. + * + * Note: we intentionally do not check IndexIsValid here; that's because this + * is used to enforce the rule that there can be only one indisprimary index, + * and we want that to be true even if said index is invalid. */ static bool relationHasPrimaryKey(Relation rel) @@ -607,6 +612,7 @@ UpdateIndexRelation(Oid indexoid, values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false); /* we set isvalid and isready the same way */ values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid); + values[Anum_pg_index_indislive - 1] = BoolGetDatum(true); values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey); values[Anum_pg_index_indcollation - 1] = PointerGetDatum(indcollation); values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass); @@ -686,7 +692,8 @@ index_create(Relation heapRelation, bool initdeferred, bool allow_system_table_mods, bool skip_build, - bool concurrent) + bool concurrent, + bool is_internal) { Oid heapRelationId = RelationGetRelid(heapRelation); Relation pg_class; @@ -1018,6 +1025,17 @@ index_create(Relation heapRelation, Assert(!initdeferred); } + /* Post creation hook for new index */ + if (object_access_hook) + { + ObjectAccessPostCreate post_create_args; + + memset(&post_create_args, 0, sizeof(ObjectAccessPostCreate)); + post_create_args.is_internal = is_internal; + (*object_access_hook)(OAT_POST_CREATE, RelationRelationId, + indexRelationId, 0, &post_create_args); + } + /* * Advance the command counter so that we can see the newly-entered * catalog tuples for the index. 
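
(Editorial aside, not part of the patch.) Both heap_create_with_catalog() and index_create() now hand the new is_internal flag to object access hooks through an ObjectAccessPostCreate argument instead of passing NULL. A minimal sketch of a consumer, assuming only the hook signature visible in the calls above; the module and function names are hypothetical:

#include "postgres.h"
#include "fmgr.h"
#include "catalog/objectaccess.h"
#include "catalog/pg_class.h"

PG_MODULE_MAGIC;

void		_PG_init(void);

static object_access_hook_type prev_object_access_hook = NULL;

static void
my_object_access(ObjectAccessType access, Oid classId, Oid objectId,
				 int subId, void *arg)
{
	/* Chain to any previously installed hook first. */
	if (prev_object_access_hook)
		(*prev_object_access_hook) (access, classId, objectId, subId, arg);

	if (access == OAT_POST_CREATE && classId == RelationRelationId)
	{
		ObjectAccessPostCreate *pc = (ObjectAccessPostCreate *) arg;

		/* Ignore relations and indexes created internally (TOAST etc.). */
		if (pc && pc->is_internal)
			return;

		elog(DEBUG1, "user-visible relation %u created", objectId);
	}
}

void
_PG_init(void)
{
	prev_object_access_hook = object_access_hook;
	object_access_hook = my_object_access;
}
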
@@ -1245,8 +1263,9 @@ index_constraint_create(Relation heapRelation, * Note: since this is a transactional update, it's unsafe against * concurrent SnapshotNow scans of pg_index. When making an existing * index into a constraint, caller must have a table lock that prevents - * concurrent table updates, and there is a risk that concurrent readers - * of the table will miss seeing this index at all. + * concurrent table updates; if it's less than a full exclusive lock, + * there is a risk that concurrent readers of the table will miss seeing + * this index at all. */ if (update_pgindex && (mark_as_primary || deferrable)) { @@ -1304,8 +1323,8 @@ index_drop(Oid indexId, bool concurrent) LockRelId heaprelid, indexrelid; LOCKTAG heaplocktag; + LOCKMODE lockmode; VirtualTransactionId *old_lockholders; - Form_pg_index indexForm; /* * To drop an index safely, we must grab exclusive lock on its parent @@ -1316,78 +1335,102 @@ index_drop(Oid indexId, bool concurrent) * table lock strong enough to prevent all queries on the table from * proceeding until we commit and send out a shared-cache-inval notice * that will make them update their index lists. + * + * In the concurrent case we avoid this requirement by disabling index use + * in multiple steps and waiting out any transactions that might be using + * the index, so we don't need exclusive lock on the parent table. Instead + * we take ShareUpdateExclusiveLock, to ensure that two sessions aren't + * doing CREATE/DROP INDEX CONCURRENTLY on the same index. (We will get + * AccessExclusiveLock on the index below, once we're sure nobody else is + * using it.) */ heapId = IndexGetRelation(indexId, false); - if (concurrent) - { - userHeapRelation = heap_open(heapId, ShareUpdateExclusiveLock); - userIndexRelation = index_open(indexId, ShareUpdateExclusiveLock); - } - else - { - userHeapRelation = heap_open(heapId, AccessExclusiveLock); - userIndexRelation = index_open(indexId, AccessExclusiveLock); - } + lockmode = concurrent ? ShareUpdateExclusiveLock : AccessExclusiveLock; + userHeapRelation = heap_open(heapId, lockmode); + userIndexRelation = index_open(indexId, lockmode); /* - * We might still have open queries using it in our own session. + * We might still have open queries using it in our own session, which the + * above locking won't prevent, so test explicitly. */ CheckTableNotInUse(userIndexRelation, "DROP INDEX"); /* - * Drop Index concurrently is similar in many ways to creating an index - * concurrently, so some actions are similar to DefineIndex() + * Drop Index Concurrently is more or less the reverse process of Create + * Index Concurrently. + * + * First we unset indisvalid so queries starting afterwards don't use the + * index to answer queries anymore. We have to keep indisready = true so + * transactions that are still scanning the index can continue to see + * valid index contents. For instance, if they are using READ COMMITTED + * mode, and another transaction makes changes and commits, they need to + * see those new tuples in the index. + * + * After all transactions that could possibly have used the index for + * queries end, we can unset indisready and indislive, then wait till + * nobody could be touching it anymore. (Note: we need indislive because + * this state must be distinct from the initial state during CREATE INDEX + * CONCURRENTLY, which has indislive true while indisready and indisvalid + * are false. 
That's because in that state, transactions must examine the + * index for HOT-safety decisions, while in this state we don't want them + * to open it at all.) + * + * Since all predicate locks on the index are about to be made invalid, we + * must promote them to predicate locks on the heap. In the + * non-concurrent case we can just do that now. In the concurrent case + * it's a bit trickier. The predicate locks must be moved when there are + * no index scans in progress on the index and no more can subsequently + * start, so that no new predicate locks can be made on the index. Also, + * they must be moved before heap inserts stop maintaining the index, else + * the conflict with the predicate lock on the index gap could be missed + * before the lock on the heap relation is in place to detect a conflict + * based on the heap tuple insert. */ if (concurrent) { /* - * Mark index invalid by updating its pg_index entry - * - * Don't Assert(indexForm->indisvalid) because we may be trying to - * clear up after an error when trying to create an index which left - * the index invalid + * We must commit our transaction in order to make the first pg_index + * state update visible to other sessions. If the DROP machinery has + * already performed any other actions (removal of other objects, + * pg_depend entries, etc), the commit would make those actions + * permanent, which would leave us with inconsistent catalog state if + * we fail partway through the following sequence. Since DROP INDEX + * CONCURRENTLY is restricted to dropping just one index that has no + * dependencies, we should get here before anything's been done --- + * but let's check that to be sure. We can verify that the current + * transaction has not executed any transactional updates by checking + * that no XID has been assigned. */ - indexRelation = heap_open(IndexRelationId, RowExclusiveLock); - - tuple = SearchSysCacheCopy1(INDEXRELID, - ObjectIdGetDatum(indexId)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for index %u", indexId); - indexForm = (Form_pg_index) GETSTRUCT(tuple); - - indexForm->indisvalid = false; /* make unusable for queries */ - indexForm->indisready = false; /* make invisible to changes */ - - simple_heap_update(indexRelation, &tuple->t_self, tuple); - CatalogUpdateIndexes(indexRelation, tuple); + if (GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DROP INDEX CONCURRENTLY must be first action in transaction"))); - heap_close(indexRelation, RowExclusiveLock); + /* + * Mark index invalid by updating its pg_index entry + */ + index_set_state_flags(indexId, INDEX_DROP_CLEAR_VALID); /* - * Invalidate the relcache for the table, so that after this - * transaction we will refresh the index list. Forgetting just the - * index is not enough. + * Invalidate the relcache for the table, so that after this commit + * all sessions will refresh any cached plans that might reference the + * index. 
*/ CacheInvalidateRelcache(userHeapRelation); /* save lockrelid and locktag for below, then close but keep locks */ heaprelid = userHeapRelation->rd_lockInfo.lockRelId; SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); - heap_close(userHeapRelation, NoLock); - indexrelid = userIndexRelation->rd_lockInfo.lockRelId; + + heap_close(userHeapRelation, NoLock); index_close(userIndexRelation, NoLock); /* - * For a concurrent drop, it's important to make the catalog entries - * visible to other transactions before we drop the index. The index - * will be marked not indisvalid, so that no one else tries to either - * insert into it or use it for queries. - * - * We must commit our current transaction so that the index update - * becomes visible; then start another. Note that all the data - * structures we just built are lost in the commit. The only data we - * keep past here are the relation IDs. + * We must commit our current transaction so that the indisvalid + * update becomes visible to other transactions; then start another. + * Note that any previously-built data structures are lost in the + * commit. The only data we keep past here are the relation IDs. * * Before committing, get a session-level lock on the table, to ensure * that neither it nor the index can be dropped before we finish. This @@ -1402,13 +1445,13 @@ index_drop(Oid indexId, bool concurrent) StartTransactionCommand(); /* - * Now we must wait until no running transaction could have the table - * open with the old list of indexes. To do this, inquire which xacts - * currently would conflict with AccessExclusiveLock on the table -- - * ie, which ones have a lock of any kind on the table. Then wait for - * each of these xacts to commit or abort. Note we do not need to - * worry about xacts that open the table for writing after this point; - * they will see the index as invalid when they open the relation. + * Now we must wait until no running transaction could be using the + * index for a query. To do this, inquire which xacts currently would + * conflict with AccessExclusiveLock on the table -- ie, which ones + * have a lock of any kind on the table. Then wait for each of these + * xacts to commit or abort. Note we do not need to worry about xacts + * that open the table for reading after this point; they will see the + * index as invalid when they open the relation. * * Note: the reason we use actual lock acquisition here, rather than * just checking the ProcArray and sleeping, is that deadlock is @@ -1428,6 +1471,56 @@ index_drop(Oid indexId, bool concurrent) old_lockholders++; } + /* + * No more predicate locks will be acquired on this index, and we're + * about to stop doing inserts into the index which could show + * conflicts with existing predicate locks, so now is the time to move + * them to the heap relation. + */ + userHeapRelation = heap_open(heapId, ShareUpdateExclusiveLock); + userIndexRelation = index_open(indexId, ShareUpdateExclusiveLock); + TransferPredicateLocksToHeapRelation(userIndexRelation); + + /* + * Now we are sure that nobody uses the index for queries; they just + * might have it open for updating it. So now we can unset indisready + * and indislive, then wait till nobody could be using it at all + * anymore. + */ + index_set_state_flags(indexId, INDEX_DROP_SET_DEAD); + + /* + * Invalidate the relcache for the table, so that after this commit + * all sessions will refresh the table's index list. Forgetting just + * the index's relcache entry is not enough. 
+ */ + CacheInvalidateRelcache(userHeapRelation); + + /* + * Close the relations again, though still holding session lock. + */ + heap_close(userHeapRelation, NoLock); + index_close(userIndexRelation, NoLock); + + /* + * Again, commit the transaction to make the pg_index update visible + * to other sessions. + */ + CommitTransactionCommand(); + StartTransactionCommand(); + + /* + * Wait till every transaction that saw the old index state has + * finished. The logic here is the same as above. + */ + old_lockholders = GetLockConflicts(&heaplocktag, AccessExclusiveLock); + + while (VirtualTransactionIdIsValid(*old_lockholders)) + { + VirtualXactLock(*old_lockholders, true); + old_lockholders++; + } + /* * Re-open relations to allow us to complete our actions. * @@ -1438,12 +1531,11 @@ index_drop(Oid indexId, bool concurrent) userHeapRelation = heap_open(heapId, ShareUpdateExclusiveLock); userIndexRelation = index_open(indexId, AccessExclusiveLock); } - - /* - * All predicate locks on the index are about to be made invalid. Promote - * them to relation locks on the heap. - */ - TransferPredicateLocksToHeapRelation(userIndexRelation); + else + { + /* Not concurrent, so just transfer predicate locks and we're good */ + TransferPredicateLocksToHeapRelation(userIndexRelation); + } /* * Schedule physical removal of the files @@ -1497,7 +1589,8 @@ index_drop(Oid indexId, bool concurrent) * of relhasindex (the next VACUUM will fix it if necessary). So there is * no need to update the pg_class tuple for the owning relation. But we * must send out a shared-cache-inval notice on the owning relation to - * ensure other backends update their relcache lists of indexes. + * ensure other backends update their relcache lists of indexes. (In the + * concurrent case, this is redundant but harmless.) */ CacheInvalidateRelcache(userHeapRelation); @@ -1573,7 +1666,7 @@ BuildIndexInfo(Relation index) /* other info */ ii->ii_Unique = indexStruct->indisunique; - ii->ii_ReadyForInserts = indexStruct->indisready; + ii->ii_ReadyForInserts = IndexIsReady(indexStruct); /* initialize index-build state to default */ ii->ii_Concurrent = false; @@ -1931,8 +2024,20 @@ index_build(Relation heapRelation, * index's usability horizon. Moreover, we *must not* try to change the * index's pg_index entry while reindexing pg_index itself, and this * optimization nicely prevents that. - */ - if (indexInfo->ii_BrokenHotChain && !isreindex) + * + * We also need not set indcheckxmin during a concurrent index build, + * because we won't set indisvalid true until all transactions that care + * about the broken HOT chains are gone. + * + * Therefore, this code path can only be taken during non-concurrent + * CREATE INDEX. Thus the fact that heap_update will set the pg_index + * tuple's xmin doesn't matter, because that tuple was created in the + * current transaction anyway. That also means we don't need to worry + * about any concurrent readers of the tuple; no other transaction can see + * it yet. + */ + if (indexInfo->ii_BrokenHotChain && !isreindex && + !indexInfo->ii_Concurrent) { Oid indexId = RelationGetRelid(indexRelation); Relation pg_index; @@ -2895,6 +3000,97 @@ validate_index_heapscan(Relation heapRelation, } +/* + * index_set_state_flags - adjust pg_index state flags + * + * This is used during CREATE/DROP INDEX CONCURRENTLY to adjust the pg_index + * flags that denote the index's state. 
We must use an in-place update of + * the pg_index tuple, because we do not have exclusive lock on the parent + * table and so other sessions might concurrently be doing SnapshotNow scans + * of pg_index to identify the table's indexes. A transactional update would + * risk somebody not seeing the index at all. Because the update is not + * transactional and will not roll back on error, this must only be used as + * the last step in a transaction that has not made any transactional catalog + * updates! + * + * Note that heap_inplace_update does send a cache inval message for the + * tuple, so other sessions will hear about the update as soon as we commit. + */ +void +index_set_state_flags(Oid indexId, IndexStateFlagsAction action) +{ + Relation pg_index; + HeapTuple indexTuple; + Form_pg_index indexForm; + + /* Assert that current xact hasn't done any transactional updates */ + Assert(GetTopTransactionIdIfAny() == InvalidTransactionId); + + /* Open pg_index and fetch a writable copy of the index's tuple */ + pg_index = heap_open(IndexRelationId, RowExclusiveLock); + + indexTuple = SearchSysCacheCopy1(INDEXRELID, + ObjectIdGetDatum(indexId)); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "cache lookup failed for index %u", indexId); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + + /* Perform the requested state change on the copy */ + switch (action) + { + case INDEX_CREATE_SET_READY: + /* Set indisready during a CREATE INDEX CONCURRENTLY sequence */ + Assert(indexForm->indislive); + Assert(!indexForm->indisready); + Assert(!indexForm->indisvalid); + indexForm->indisready = true; + break; + case INDEX_CREATE_SET_VALID: + /* Set indisvalid during a CREATE INDEX CONCURRENTLY sequence */ + Assert(indexForm->indislive); + Assert(indexForm->indisready); + Assert(!indexForm->indisvalid); + indexForm->indisvalid = true; + break; + case INDEX_DROP_CLEAR_VALID: + + /* + * Clear indisvalid during a DROP INDEX CONCURRENTLY sequence + * + * If indisready == true we leave it set so the index still gets + * maintained by active transactions. We only need to ensure that + * indisvalid is false. (We don't assert that either is initially + * true, though, since we want to be able to retry a DROP INDEX + * CONCURRENTLY that failed partway through.) + * + * Note: the CLUSTER logic assumes that indisclustered cannot be + * set on any invalid index, so clear that flag too. + */ + indexForm->indisvalid = false; + indexForm->indisclustered = false; + break; + case INDEX_DROP_SET_DEAD: + + /* + * Clear indisready/indislive during DROP INDEX CONCURRENTLY + * + * We clear both indisready and indislive, because we not only + * want to stop updates, we want to prevent sessions from touching + * the index at all. + */ + Assert(!indexForm->indisvalid); + indexForm->indisready = false; + indexForm->indislive = false; + break; + } + + /* ... and write it back in-place */ + heap_inplace_update(pg_index, indexTuple); + + heap_close(pg_index, RowExclusiveLock); +} + + /* * IndexGetRelation: given an index's relation OID, get the OID of the * relation it is an index on. Uses the system cache. 
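
(Editorial aside, not part of the patch.) Putting the UpdateIndexRelation() change and index_set_state_flags() together, the concurrent paths walk the three pg_index flags through a fixed set of combinations. The IndexIsValid()/IndexIsReady() tests used elsewhere in the patch each read a single flag; the helper below is hypothetical and only spells out the fully usable state:

#include "postgres.h"
#include "catalog/pg_index.h"

/*
 * (indislive, indisready, indisvalid) along the concurrent paths, as set
 * by UpdateIndexRelation() and index_set_state_flags() in this patch:
 *
 *   CREATE INDEX CONCURRENTLY, initial catalog entry       t  f  f
 *   after INDEX_CREATE_SET_READY                           t  t  f
 *   after INDEX_CREATE_SET_VALID                           t  t  t
 *   after INDEX_DROP_CLEAR_VALID (ready left as it was)    t  t  f
 *   after INDEX_DROP_SET_DEAD                              f  f  f
 */
static bool
index_usable_for_queries(Form_pg_index indexForm)
{
	/* Only a live, ready and valid index should be used to answer queries. */
	return indexForm->indislive &&
		   indexForm->indisready &&
		   indexForm->indisvalid;
}
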
@@ -2928,12 +3124,9 @@ void reindex_index(Oid indexId, bool skip_constraint_checks) { Relation iRel, - heapRelation, - pg_index; + heapRelation; Oid heapId; IndexInfo *indexInfo; - HeapTuple indexTuple; - Form_pg_index indexForm; volatile bool skipped_constraint = false; /* @@ -3006,32 +3199,48 @@ reindex_index(Oid indexId, bool skip_constraint_checks) ResetReindexProcessing(); /* - * If the index is marked invalid or not ready (ie, it's from a failed - * CREATE INDEX CONCURRENTLY), and we didn't skip a uniqueness check, we - * can now mark it valid. This allows REINDEX to be used to clean up in - * such cases. + * If the index is marked invalid/not-ready/dead (ie, it's from a failed + * CREATE INDEX CONCURRENTLY, or a DROP INDEX CONCURRENTLY failed midway), + * and we didn't skip a uniqueness check, we can now mark it valid. This + * allows REINDEX to be used to clean up in such cases. * * We can also reset indcheckxmin, because we have now done a * non-concurrent index build, *except* in the case where index_build - * found some still-broken HOT chains. If it did, we normally leave - * indcheckxmin alone (note that index_build won't have changed it, - * because this is a reindex). But if the index was invalid or not ready - * and there were broken HOT chains, it seems best to force indcheckxmin - * true, because the normal argument that the HOT chains couldn't conflict - * with the index is suspect for an invalid index. + * found some still-broken HOT chains. If it did, and we don't have to + * change any of the other flags, we just leave indcheckxmin alone (note + * that index_build won't have changed it, because this is a reindex). + * This is okay and desirable because not updating the tuple leaves the + * index's usability horizon (recorded as the tuple's xmin value) the same + * as it was. + * + * But, if the index was invalid/not-ready/dead and there were broken HOT + * chains, we had better force indcheckxmin true, because the normal + * argument that the HOT chains couldn't conflict with the index is + * suspect for an invalid index. (A conflict is definitely possible if + * the index was dead. It probably shouldn't happen otherwise, but let's + * be conservative.) In this case advancing the usability horizon is + * appropriate. + * + * Note that if we have to update the tuple, there is a risk of concurrent + * transactions not seeing it during their SnapshotNow scans of pg_index. + * While not especially desirable, this is safe because no such + * transaction could be trying to update the table (since we have + * ShareLock on it). The worst case is that someone might transiently + * fail to use the index for a query --- but it was probably unusable + * before anyway, if we are updating the tuple. * - * Note that it is important to not update the pg_index entry if we don't - * have to, because updating it will move the index's usability horizon - * (recorded as the tuple's xmin value) if indcheckxmin is true. We don't - * really want REINDEX to move the usability horizon forward ever, but we - * have no choice if we are to fix indisvalid or indisready. Of course, - * clearing indcheckxmin eliminates the issue, so we're happy to do that - * if we can. Another reason for caution here is that while reindexing - * pg_index itself, we must not try to update it. We assume that - * pg_index's indexes will always have these flags in their clean state. 
+ * Another reason for avoiding unnecessary updates here is that while + * reindexing pg_index itself, we must not try to update tuples in it. + * pg_index's indexes should always have these flags in their clean state, + * so that won't happen. */ if (!skipped_constraint) { + Relation pg_index; + HeapTuple indexTuple; + Form_pg_index indexForm; + bool index_bad; + pg_index = heap_open(IndexRelationId, RowExclusiveLock); indexTuple = SearchSysCacheCopy1(INDEXRELID, @@ -3040,17 +3249,30 @@ reindex_index(Oid indexId, bool skip_constraint_checks) elog(ERROR, "cache lookup failed for index %u", indexId); indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - if (!indexForm->indisvalid || !indexForm->indisready || + index_bad = (!indexForm->indisvalid || + !indexForm->indisready || + !indexForm->indislive); + if (index_bad || (indexForm->indcheckxmin && !indexInfo->ii_BrokenHotChain)) { if (!indexInfo->ii_BrokenHotChain) indexForm->indcheckxmin = false; - else if (!indexForm->indisvalid || !indexForm->indisready) + else if (index_bad) indexForm->indcheckxmin = true; indexForm->indisvalid = true; indexForm->indisready = true; + indexForm->indislive = true; simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); CatalogUpdateIndexes(pg_index, indexTuple); + + /* + * Invalidate the relcache for the table, so that after we commit + * all sessions will refresh the table's index list. This ensures + * that if anyone misses seeing the pg_index row during this + * update, they'll refresh their list before attempting any update + * on the table. + */ + CacheInvalidateRelcache(heapRelation); } heap_close(pg_index, RowExclusiveLock); diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index 8df3f18e30..5e8c6da807 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -679,7 +679,7 @@ RenameConstraintById(Oid conId, const char *newname) */ void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, - Oid newNspId, bool isType) + Oid newNspId, bool isType, ObjectAddresses *objsMoved) { Relation conRel; ScanKeyData key[1]; @@ -712,6 +712,14 @@ AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, while (HeapTupleIsValid((tup = systable_getnext(scan)))) { Form_pg_constraint conform = (Form_pg_constraint) GETSTRUCT(tup); + ObjectAddress thisobj; + + thisobj.classId = ConstraintRelationId; + thisobj.objectId = HeapTupleGetOid(tup); + thisobj.objectSubId = 0; + + if (object_address_present(&thisobj, objsMoved)) + continue; if (conform->connamespace == oldNspId) { @@ -729,6 +737,8 @@ AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, * changeDependencyFor(). */ } + + add_exact_object_address(&thisobj, objsMoved); } systable_endscan(scan); diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 993bc49c2a..2446282493 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -24,6 +24,7 @@ #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/storage.h" +#include "catalog/storage_xlog.h" #include "storage/freespace.h" #include "storage/smgr.h" #include "utils/memutils.h" @@ -60,30 +61,6 @@ typedef struct PendingRelDelete static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ -/* - * Declarations for smgr-related XLOG records - * - * Note: we log file creation and truncation here, but logging of deletion - * actions is handled by xact.c, because it is part of transaction commit. 
- */ - -/* XLOG gives us high 4 bits */ -#define XLOG_SMGR_CREATE 0x10 -#define XLOG_SMGR_TRUNCATE 0x20 - -typedef struct xl_smgr_create -{ - RelFileNode rnode; - ForkNumber forkNum; -} xl_smgr_create; - -typedef struct xl_smgr_truncate -{ - BlockNumber blkno; - RelFileNode rnode; -} xl_smgr_truncate; - - /* * RelationCreateStorage * Create physical storage for a relation. @@ -523,29 +500,3 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "smgr_redo: unknown op code %u", info); } - -void -smgr_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_SMGR_CREATE) - { - xl_smgr_create *xlrec = (xl_smgr_create *) rec; - char *path = relpathperm(xlrec->rnode, xlrec->forkNum); - - appendStringInfo(buf, "file create: %s", path); - pfree(path); - } - else if (info == XLOG_SMGR_TRUNCATE) - { - xl_smgr_truncate *xlrec = (xl_smgr_truncate *) rec; - char *path = relpathperm(xlrec->rnode, MAIN_FORKNUM); - - appendStringInfo(buf, "file truncate: %s to %u blocks", path, - xlrec->blkno); - pfree(path); - } - else - appendStringInfo(buf, "UNKNOWN"); -} diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 1feffd25ef..2979819e96 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -226,6 +226,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio ONCOMMIT_NOOP, reloptions, false, + true, true); Assert(toast_relid != InvalidOid); @@ -279,7 +280,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio rel->rd_rel->reltablespace, collationObjectId, classObjectId, coloptions, (Datum) 0, true, false, false, false, - true, false, false); + true, false, false, true); heap_close(toast_rel, NoLock); diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index 8df875b8a8..9e271c3e59 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -252,7 +252,8 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt) * object doesn't have a schema. */ Oid -AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid) +AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid, + ObjectAddresses *objsMoved) { Oid oldNspOid = InvalidOid; ObjectAddress dep; @@ -266,20 +267,11 @@ AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid) case OCLASS_CLASS: { Relation rel; - Relation classRel; rel = relation_open(objid, AccessExclusiveLock); oldNspOid = RelationGetNamespace(rel); - classRel = heap_open(RelationRelationId, RowExclusiveLock); - - AlterRelationNamespaceInternal(classRel, - objid, - oldNspOid, - nspOid, - true); - - heap_close(classRel, RowExclusiveLock); + AlterTableNamespaceInternal(rel, oldNspOid, nspOid, objsMoved); relation_close(rel, NoLock); break; @@ -290,7 +282,7 @@ AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid) break; case OCLASS_TYPE: - oldNspOid = AlterTypeNamespace_oid(objid, nspOid); + oldNspOid = AlterTypeNamespace_oid(objid, nspOid, objsMoved); break; case OCLASS_COLLATION: diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index cfec413d54..c3deb56762 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -444,7 +444,7 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMOD * might put recently-dead tuples out-of-order in the new table, and there * is little harm in that.) 
*/ - if (!OldIndex->rd_index->indisvalid) + if (!IndexIsValid(OldIndex->rd_index)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot cluster on invalid index \"%s\"", @@ -458,6 +458,11 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMOD * mark_index_clustered: mark the specified index as the one clustered on * * With indexOid == InvalidOid, will mark all indexes of rel not-clustered. + * + * Note: we do transactional updates of the pg_index rows, which are unsafe + * against concurrent SnapshotNow scans of pg_index. Therefore this is unsafe + * to execute with less than full exclusive lock on the parent table; + * otherwise concurrent executions of RelationGetIndexList could miss indexes. */ void mark_index_clustered(Relation rel, Oid indexOid) @@ -513,6 +518,9 @@ mark_index_clustered(Relation rel, Oid indexOid) } else if (thisIndexOid == indexOid) { + /* this was checked earlier, but let's be real sure */ + if (!IndexIsValid(indexForm)) + elog(ERROR, "cannot cluster on invalid index %u", indexOid); indexForm->indisclustered = true; simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); CatalogUpdateIndexes(pg_index, indexTuple); @@ -643,6 +651,7 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) ONCOMMIT_NOOP, reloptions, false, + true, true); Assert(OIDNewHeap != InvalidOid); diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 0567ab003d..09f40667f6 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -44,6 +44,7 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/memutils.h" +#include "utils/portal.h" #include "utils/rel.h" #include "utils/snapmgr.h" @@ -109,6 +110,7 @@ typedef struct CopyStateData char *filename; /* filename, or NULL for STDIN/STDOUT */ bool binary; /* binary format? */ bool oids; /* include OIDs? */ + bool freeze; /* freeze rows on loading? */ bool csv_mode; /* Comma Separated Value format? */ bool header_line; /* CSV header line? */ char *null_print; /* NULL marker string (server encoding!) */ @@ -895,6 +897,14 @@ ProcessCopyOptions(CopyState cstate, errmsg("conflicting or redundant options"))); cstate->oids = defGetBoolean(defel); } + else if (strcmp(defel->defname, "freeze") == 0) + { + if (cstate->freeze) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + cstate->freeze = defGetBoolean(defel); + } else if (strcmp(defel->defname, "delimiter") == 0) { if (cstate->delim) @@ -1901,7 +1911,7 @@ CopyFrom(CopyState cstate) TupleTableSlot *myslot; MemoryContext oldcontext = CurrentMemoryContext; - ErrorContextCallback errcontext; + ErrorContextCallback errcallback; CommandId mycid = GetCurrentCommandId(true); int hi_options = 0; /* start with default heap_insert options */ BulkInsertState bistate; @@ -1974,8 +1984,28 @@ CopyFrom(CopyState cstate) hi_options |= HEAP_INSERT_SKIP_FSM; if (!XLogIsNeeded()) hi_options |= HEAP_INSERT_SKIP_WAL; + + /* + * Optimize if new relfilenode was created in this subxact or + * one of its committed children and we won't see those rows later + * as part of an earlier scan or command. This ensures that if this + * subtransaction aborts then the frozen rows won't be visible + * after xact cleanup. Note that the stronger test of exactly + * which subtransaction created it is crucial for correctness + * of this optimisation. 
+ */ + if (cstate->freeze && + ThereAreNoPriorRegisteredSnapshots() && + ThereAreNoReadyPortals() && + cstate->rel->rd_newRelfilenodeSubid == GetCurrentSubTransactionId()) + hi_options |= HEAP_INSERT_FROZEN; } + if (cstate->freeze && (hi_options & HEAP_INSERT_FROZEN) == 0) + ereport(NOTICE, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("FREEZE option specified but pre-conditions not met"))); + /* * We need a ResultRelInfo so we can use the regular executor's * index-entry-making machinery. (There used to be a huge amount of code @@ -2046,10 +2076,10 @@ CopyFrom(CopyState cstate) econtext = GetPerTupleExprContext(estate); /* Set up callback to identify error line number */ - errcontext.callback = CopyFromErrorCallback; - errcontext.arg = (void *) cstate; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; + errcallback.callback = CopyFromErrorCallback; + errcallback.arg = (void *) cstate; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; for (;;) { @@ -2164,7 +2194,7 @@ CopyFrom(CopyState cstate) nBufferedTuples, bufferedTuples); /* Done, clean up */ - error_context_stack = errcontext.previous; + error_context_stack = errcallback.previous; FreeBulkInsertState(bistate); diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index cdbce97c4f..3c13c470fd 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1992,27 +1992,3 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "dbase_redo: unknown op code %u", info); } - -void -dbase_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_DBASE_CREATE) - { - xl_dbase_create_rec *xlrec = (xl_dbase_create_rec *) rec; - - appendStringInfo(buf, "create db: copy dir %u/%u to %u/%u", - xlrec->src_db_id, xlrec->src_tablespace_id, - xlrec->db_id, xlrec->tablespace_id); - } - else if (info == XLOG_DBASE_DROP) - { - xl_dbase_drop_rec *xlrec = (xl_dbase_drop_rec *) rec; - - appendStringInfo(buf, "drop db: dir %u/%u", - xlrec->db_id, xlrec->tablespace_id); - } - else - appendStringInfo(buf, "UNKNOWN"); -} diff --git a/src/backend/commands/dropcmds.c b/src/backend/commands/dropcmds.c index 25c127b252..e4c17bd2bc 100644 --- a/src/backend/commands/dropcmds.c +++ b/src/backend/commands/dropcmds.c @@ -203,7 +203,7 @@ does_not_exist_skipping(ObjectType objtype, List *objname, List *objargs) case OBJECT_TRIGGER: msg = gettext_noop("trigger \"%s\" for table \"%s\" does not exist, skipping"); name = strVal(llast(objname)); - args = NameListToString(list_truncate(objname, + args = NameListToString(list_truncate(list_copy(objname), list_length(objname) - 1)); break; case OBJECT_EVENT_TRIGGER: @@ -213,7 +213,7 @@ does_not_exist_skipping(ObjectType objtype, List *objname, List *objargs) case OBJECT_RULE: msg = gettext_noop("rule \"%s\" for relation \"%s\" does not exist, skipping"); name = strVal(llast(objname)); - args = NameListToString(list_truncate(objname, + args = NameListToString(list_truncate(list_copy(objname), list_length(objname) - 1)); break; case OBJECT_FDW: diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 5aa9bbb19c..47631beb77 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -2204,6 +2204,7 @@ AlterExtensionNamespace(List *names, const char *newschema) Relation depRel; SysScanDesc depScan; HeapTuple depTup; + ObjectAddresses *objsMoved; if (list_length(names) != 1) 
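
The CopyFrom comment above spells out when HEAP_INSERT_FROZEN may be used: the table's relfilenode must have been created by the current subtransaction, and no earlier snapshot or already-opened portal may still be able to see the table's prior contents. A sketch that isolates the test into a predicate; every symbol except the hypothetical helper name comes straight from the hunk:

/* Hypothetical predicate mirroring the test CopyFrom performs. */
static bool
copy_can_freeze(CopyState cstate)
{
    /*
     * Freezing requires that the relfilenode belong to the *current*
     * subtransaction, so that an abort of this subxact removes the frozen
     * rows along with the file, and that no prior snapshot or ready portal
     * could still see the table's old contents.
     */
    return cstate->freeze &&
        ThereAreNoPriorRegisteredSnapshots() &&
        ThereAreNoReadyPortals() &&
        cstate->rel->rd_newRelfilenodeSubid == GetCurrentSubTransactionId();
}

When the test fails, the patch downgrades to a NOTICE rather than an error, so COPY ... (FREEZE) still loads the rows, just without the pre-frozen optimization.
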
ereport(ERROR, @@ -2278,6 +2279,8 @@ AlterExtensionNamespace(List *names, const char *newschema) errmsg("extension \"%s\" does not support SET SCHEMA", NameStr(extForm->extname)))); + objsMoved = new_object_addresses(); + /* * Scan pg_depend to find objects that depend directly on the extension, * and alter each one's schema. @@ -2317,9 +2320,11 @@ AlterExtensionNamespace(List *names, const char *newschema) if (dep.objectSubId != 0) /* should not happen */ elog(ERROR, "extension should not have a sub-object dependency"); + /* Relocate the object */ dep_oldNspOid = AlterObjectNamespace_oid(dep.classId, dep.objectId, - nspOid); + nspOid, + objsMoved); /* * Remember previous namespace of first object that has one diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index a58101ec6e..75f9ff19cc 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -124,6 +124,7 @@ CheckIndexCompatible(Oid oldId, Oid accessMethodId; Oid relationId; HeapTuple tuple; + Form_pg_index indexForm; Form_pg_am accessMethodForm; bool amcanorder; int16 *coloptions; @@ -193,17 +194,22 @@ CheckIndexCompatible(Oid oldId, tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(oldId)); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for index %u", oldId); + indexForm = (Form_pg_index) GETSTRUCT(tuple); - /* We don't assess expressions or predicates; assume incompatibility. */ + /* + * We don't assess expressions or predicates; assume incompatibility. + * Also, if the index is invalid for any reason, treat it as incompatible. + */ if (!(heap_attisnull(tuple, Anum_pg_index_indpred) && - heap_attisnull(tuple, Anum_pg_index_indexprs))) + heap_attisnull(tuple, Anum_pg_index_indexprs) && + IndexIsValid(indexForm))) { ReleaseSysCache(tuple); return false; } /* Any change in operator class or collation breaks compatibility. */ - old_natts = ((Form_pg_index) GETSTRUCT(tuple))->indnatts; + old_natts = indexForm->indnatts; Assert(old_natts == numberOfAttributes); d = SysCacheGetAttr(INDEXRELID, tuple, Anum_pg_index_indcollation, &isnull); @@ -320,9 +326,6 @@ DefineIndex(IndexStmt *stmt, LockRelId heaprelid; LOCKTAG heaplocktag; Snapshot snapshot; - Relation pg_index; - HeapTuple indexTuple; - Form_pg_index indexForm; int i; /* @@ -596,7 +599,7 @@ DefineIndex(IndexStmt *stmt, stmt->isconstraint, stmt->deferrable, stmt->initdeferred, allowSystemTableMods, skip_build || stmt->concurrent, - stmt->concurrent); + stmt->concurrent, !check_rights); /* Add any requested comment */ if (stmt->idxcomment != NULL) @@ -717,23 +720,7 @@ DefineIndex(IndexStmt *stmt, * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates. 
*/ - pg_index = heap_open(IndexRelationId, RowExclusiveLock); - - indexTuple = SearchSysCacheCopy1(INDEXRELID, - ObjectIdGetDatum(indexRelationId)); - if (!HeapTupleIsValid(indexTuple)) - elog(ERROR, "cache lookup failed for index %u", indexRelationId); - indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - - Assert(!indexForm->indisready); - Assert(!indexForm->indisvalid); - - indexForm->indisready = true; - - simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); - CatalogUpdateIndexes(pg_index, indexTuple); - - heap_close(pg_index, RowExclusiveLock); + index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); /* we can do away with our snapshot */ PopActiveSnapshot(); @@ -857,23 +844,7 @@ DefineIndex(IndexStmt *stmt, /* * Index can now be marked valid -- update its pg_index entry */ - pg_index = heap_open(IndexRelationId, RowExclusiveLock); - - indexTuple = SearchSysCacheCopy1(INDEXRELID, - ObjectIdGetDatum(indexRelationId)); - if (!HeapTupleIsValid(indexTuple)) - elog(ERROR, "cache lookup failed for index %u", indexRelationId); - indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - - Assert(indexForm->indisready); - Assert(!indexForm->indisvalid); - - indexForm->indisvalid = true; - - simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); - CatalogUpdateIndexes(pg_index, indexTuple); - - heap_close(pg_index, RowExclusiveLock); + index_set_state_flags(indexRelationId, INDEX_CREATE_SET_VALID); /* * The pg_index update will cause backends (including this one) to update @@ -881,7 +852,7 @@ DefineIndex(IndexStmt *stmt, * relcache inval on the parent table to force replanning of cached plans. * Otherwise existing sessions might fail to use the new index where it * would be useful. (Note that our earlier commits did not create reasons - * to replan; relcache flush on the index itself was sufficient.) + * to replan; so relcache flush on the index itself was sufficient.) */ CacheInvalidateRelcacheByRelid(heaprelid.relId); diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index e402042332..b5ce87a29d 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -121,7 +121,7 @@ PerformCursorOpen(PlannedStmt *stmt, ParamListInfo params, /* * Start execution, inserting parameters if any. */ - PortalStart(portal, params, 0, true); + PortalStart(portal, params, 0, GetActiveSnapshot()); Assert(portal->strategy == PORTAL_ONE_SELECT); diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 9f993de6f1..91ef5507e2 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -289,7 +289,7 @@ ExecuteQuery(ExecuteStmt *stmt, IntoClause *intoClause, /* * Run the portal as appropriate. 
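
DefineIndex used to open pg_index, fetch the index's row, flip indisready or indisvalid by hand, and write the row back; both copies of that boilerplate are replaced above by index_set_state_flags() with INDEX_CREATE_SET_READY or INDEX_CREATE_SET_VALID. The helper's body is not part of this excerpt; judging from the code it replaces, it presumably centralizes roughly the following (the action type name used here is an assumption):

void
index_set_state_flags(Oid indexId, IndexStateFlagsAction action)
{
    Relation    pg_index;
    HeapTuple   indexTuple;
    Form_pg_index indexForm;

    pg_index = heap_open(IndexRelationId, RowExclusiveLock);

    indexTuple = SearchSysCacheCopy1(INDEXRELID, ObjectIdGetDatum(indexId));
    if (!HeapTupleIsValid(indexTuple))
        elog(ERROR, "cache lookup failed for index %u", indexId);
    indexForm = (Form_pg_index) GETSTRUCT(indexTuple);

    switch (action)
    {
        case INDEX_CREATE_SET_READY:
            Assert(!indexForm->indisready);
            indexForm->indisready = true;   /* accept inserts from now on */
            break;
        case INDEX_CREATE_SET_VALID:
            Assert(indexForm->indisready);
            Assert(!indexForm->indisvalid);
            indexForm->indisvalid = true;   /* usable for queries */
            break;
    }

    simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
    CatalogUpdateIndexes(pg_index, indexTuple);

    heap_close(pg_index, RowExclusiveLock);
}
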
*/ - PortalStart(portal, paramLI, eflags, true); + PortalStart(portal, paramLI, eflags, GetActiveSnapshot()); (void) PortalRun(portal, count, false, dest, dest, completionTag); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 4f55830ae4..634ce3f718 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1595,21 +1595,3 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) pfree(localpage); } - -void -seq_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - xl_seq_rec *xlrec = (xl_seq_rec *) rec; - - if (info == XLOG_SEQ_LOG) - appendStringInfo(buf, "log: "); - else - { - appendStringInfo(buf, "UNKNOWN"); - return; - } - - appendStringInfo(buf, "rel %u/%u/%u", - xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode); -} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 359d478592..4065740b33 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -261,10 +261,10 @@ static void StoreCatalogInheritance1(Oid relationId, Oid parentOid, int16 seqNumber, Relation inhRelation); static int findAttrByName(const char *attributeName, List *schema); static void AlterIndexNamespaces(Relation classRel, Relation rel, - Oid oldNspOid, Oid newNspOid); + Oid oldNspOid, Oid newNspOid, ObjectAddresses *objsMoved); static void AlterSeqNamespaces(Relation classRel, Relation rel, - Oid oldNspOid, Oid newNspOid, - const char *newNspName, LOCKMODE lockmode); + Oid oldNspOid, Oid newNspOid, ObjectAddresses *objsMoved, + LOCKMODE lockmode); static void ATExecValidateConstraint(Relation rel, char *constrName, bool recurse, bool recursing, LOCKMODE lockmode); static int transformColumnNameList(Oid relId, List *colList, @@ -335,13 +335,15 @@ static void ATExecAddIndex(AlteredTableInfo *tab, Relation rel, IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode); static void ATExecAddConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, - Constraint *newConstraint, bool recurse, LOCKMODE lockmode); + Constraint *newConstraint, bool recurse, bool is_readd, + LOCKMODE lockmode); static void ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, IndexStmt *stmt, LOCKMODE lockmode); static void ATAddCheckConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, Constraint *constr, - bool recurse, bool recursing, LOCKMODE lockmode); + bool recurse, bool recursing, bool is_readd, + LOCKMODE lockmode); static void ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, Constraint *fkconstraint, LOCKMODE lockmode); static void ATExecDropConstraint(Relation rel, const char *constrName, @@ -630,7 +632,8 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId) stmt->oncommit, reloptions, true, - allowSystemTableMods); + allowSystemTableMods, + false); /* Store inheritance information for new rel. 
*/ StoreCatalogInheritance(relationId, inheritOids); @@ -741,10 +744,13 @@ RemoveRelations(DropStmt *drop) int flags = 0; LOCKMODE lockmode = AccessExclusiveLock; + /* DROP CONCURRENTLY uses a weaker lock, and has some restrictions */ if (drop->concurrent) { + flags |= PERFORM_DELETION_CONCURRENTLY; lockmode = ShareUpdateExclusiveLock; - if (list_length(drop->objects) > 1) + Assert(drop->removeType == OBJECT_INDEX); + if (list_length(drop->objects) != 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("DROP INDEX CONCURRENTLY does not support dropping multiple objects"))); @@ -836,19 +842,6 @@ RemoveRelations(DropStmt *drop) add_exact_object_address(&obj, objects); } - /* - * Set options and check further requirements for concurrent drop - */ - if (drop->concurrent) - { - /* - * Confirm that concurrent behaviour is restricted in grammar. - */ - Assert(drop->removeType == OBJECT_INDEX); - - flags |= PERFORM_DELETION_CONCURRENTLY; - } - performMultipleDeletions(objects, drop->behavior, flags); free_object_addresses(objects); @@ -915,7 +908,7 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, * locking the index. index_drop() will need this anyway, and since * regular queries lock tables before their indexes, we risk deadlock if * we do it the other way around. No error if we don't find a pg_index - * entry, though --- the relation may have been droppd. + * entry, though --- the relation may have been dropped. */ if (relkind == RELKIND_INDEX && relOid != oldRelOid) { @@ -2757,6 +2750,7 @@ AlterTableGetLockLevel(List *cmds) case AT_ColumnDefault: case AT_ProcessedConstraint: /* becomes AT_AddConstraint */ case AT_AddConstraintRecurse: /* becomes AT_AddConstraint */ + case AT_ReAddConstraint: /* becomes AT_AddConstraint */ case AT_EnableTrig: case AT_EnableAlwaysTrig: case AT_EnableReplicaTrig: @@ -3248,11 +3242,15 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, break; case AT_AddConstraint: /* ADD CONSTRAINT */ ATExecAddConstraint(wqueue, tab, rel, (Constraint *) cmd->def, - false, lockmode); + false, false, lockmode); break; case AT_AddConstraintRecurse: /* ADD CONSTRAINT with recursion */ ATExecAddConstraint(wqueue, tab, rel, (Constraint *) cmd->def, - true, lockmode); + true, false, lockmode); + break; + case AT_ReAddConstraint: /* Re-add pre-existing check constraint */ + ATExecAddConstraint(wqueue, tab, rel, (Constraint *) cmd->def, + false, true, lockmode); break; case AT_AddIndexConstraint: /* ADD CONSTRAINT USING INDEX */ ATExecAddIndexConstraint(tab, rel, (IndexStmt *) cmd->def, lockmode); @@ -4776,6 +4774,8 @@ ATExecDropNotNull(Relation rel, const char *colName, LOCKMODE lockmode) /* * Check that the attribute is not in a primary key + * + * Note: we'll throw error even if the pkey index is not valid. 
*/ /* Loop over all indexes on the relation */ @@ -5499,7 +5499,8 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, */ static void ATExecAddConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, - Constraint *newConstraint, bool recurse, LOCKMODE lockmode) + Constraint *newConstraint, bool recurse, bool is_readd, + LOCKMODE lockmode) { Assert(IsA(newConstraint, Constraint)); @@ -5512,7 +5513,8 @@ ATExecAddConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, { case CONSTR_CHECK: ATAddCheckConstraint(wqueue, tab, rel, - newConstraint, recurse, false, lockmode); + newConstraint, recurse, false, is_readd, + lockmode); break; case CONSTR_FOREIGN: @@ -5564,11 +5566,18 @@ ATExecAddConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, * AddRelationNewConstraints would normally assign different names to the * child constraints. To fix that, we must capture the name assigned at * the parent table and pass that down. + * + * When re-adding a previously existing constraint (during ALTER COLUMN TYPE), + * we don't need to recurse here, because recursion will be carried out at a + * higher level; the constraint name issue doesn't apply because the names + * have already been assigned and are just being re-used. We need a separate + * "is_readd" flag for that; just setting recurse=false would result in an + * error if there are child tables. */ static void ATAddCheckConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, Constraint *constr, bool recurse, bool recursing, - LOCKMODE lockmode) + bool is_readd, LOCKMODE lockmode) { List *newcons; ListCell *lcon; @@ -5633,9 +5642,11 @@ ATAddCheckConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, return; /* - * Adding a NO INHERIT constraint? No need to find our children + * If adding a NO INHERIT constraint, no need to find our children. + * Likewise, in a re-add operation, we don't need to recurse (that will be + * handled at higher levels). */ - if (constr->is_no_inherit) + if (constr->is_no_inherit || is_readd) return; /* @@ -5670,7 +5681,7 @@ ATAddCheckConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, /* Recurse to child */ ATAddCheckConstraint(wqueue, childtab, childrel, - constr, recurse, true, lockmode); + constr, recurse, true, is_readd, lockmode); heap_close(childrel, NoLock); } @@ -6299,7 +6310,7 @@ transformFkeyGetPrimaryKey(Relation pkrel, Oid *indexOid, /* * Get the list of index OIDs for the table from the relcache, and look up * each one in the pg_index syscache until we find one marked primary key - * (hopefully there isn't more than one such). + * (hopefully there isn't more than one such). Insist it's valid, too. */ *indexOid = InvalidOid; @@ -6313,7 +6324,7 @@ transformFkeyGetPrimaryKey(Relation pkrel, Oid *indexOid, if (!HeapTupleIsValid(indexTuple)) elog(ERROR, "cache lookup failed for index %u", indexoid); indexStruct = (Form_pg_index) GETSTRUCT(indexTuple); - if (indexStruct->indisprimary) + if (indexStruct->indisprimary && IndexIsValid(indexStruct)) { /* * Refuse to use a deferrable primary key. This is per SQL spec, @@ -6411,10 +6422,12 @@ transformFkeyCheckAttrs(Relation pkrel, /* * Must have the right number of columns; must be unique and not a - * partial index; forget it if there are any expressions, too + * partial index; forget it if there are any expressions, too. Invalid + * indexes are out as well. 
*/ if (indexStruct->indnatts == numattrs && indexStruct->indisunique && + IndexIsValid(indexStruct) && heap_attisnull(indexTuple, Anum_pg_index_indpred) && heap_attisnull(indexTuple, Anum_pg_index_indexprs)) { @@ -7861,6 +7874,10 @@ ATPostAlterTypeParse(Oid oldId, char *cmd, /* * Attach each generated command to the proper place in the work queue. * Note this could result in creation of entirely new work-queue entries. + * + * Also note that we have to tweak the command subtypes, because it turns + * out that re-creation of indexes and constraints has to act a bit + * differently from initial creation. */ foreach(list_item, querytree_list) { @@ -7918,6 +7935,7 @@ ATPostAlterTypeParse(Oid oldId, char *cmd, if (con->contype == CONSTR_FOREIGN && !rewrite && !tab->rewrite) TryReuseForeignKey(oldId, con); + cmd->subtype = AT_ReAddConstraint; tab->subcmds[AT_PASS_OLD_CONSTR] = lappend(tab->subcmds[AT_PASS_OLD_CONSTR], cmd); break; @@ -9710,8 +9728,8 @@ AlterTableNamespace(AlterObjectSchemaStmt *stmt) Oid relid; Oid oldNspOid; Oid nspOid; - Relation classRel; RangeVar *newrv; + ObjectAddresses *objsMoved; relid = RangeVarGetRelidExtended(stmt->relation, AccessExclusiveLock, stmt->missing_ok, false, @@ -9752,27 +9770,47 @@ AlterTableNamespace(AlterObjectSchemaStmt *stmt) /* common checks on switching namespaces */ CheckSetNamespace(oldNspOid, nspOid, RelationRelationId, relid); + objsMoved = new_object_addresses(); + AlterTableNamespaceInternal(rel, oldNspOid, nspOid, objsMoved); + free_object_addresses(objsMoved); + + /* close rel, but keep lock until commit */ + relation_close(rel, NoLock); +} + +/* + * The guts of relocating a table to another namespace: besides moving + * the table itself, its dependent objects are relocated to the new schema. + */ +void +AlterTableNamespaceInternal(Relation rel, Oid oldNspOid, Oid nspOid, + ObjectAddresses *objsMoved) +{ + Relation classRel; + + Assert(objsMoved != NULL); + /* OK, modify the pg_class row and pg_depend entry */ classRel = heap_open(RelationRelationId, RowExclusiveLock); - AlterRelationNamespaceInternal(classRel, relid, oldNspOid, nspOid, true); + AlterRelationNamespaceInternal(classRel, RelationGetRelid(rel), oldNspOid, + nspOid, true, objsMoved); /* Fix the table's row type too */ - AlterTypeNamespaceInternal(rel->rd_rel->reltype, nspOid, false, false); + AlterTypeNamespaceInternal(rel->rd_rel->reltype, + nspOid, false, false, objsMoved); /* Fix other dependent stuff */ if (rel->rd_rel->relkind == RELKIND_RELATION) { - AlterIndexNamespaces(classRel, rel, oldNspOid, nspOid); - AlterSeqNamespaces(classRel, rel, oldNspOid, nspOid, stmt->newschema, - AccessExclusiveLock); - AlterConstraintNamespaces(relid, oldNspOid, nspOid, false); + AlterIndexNamespaces(classRel, rel, oldNspOid, nspOid, objsMoved); + AlterSeqNamespaces(classRel, rel, oldNspOid, nspOid, + objsMoved, AccessExclusiveLock); + AlterConstraintNamespaces(RelationGetRelid(rel), oldNspOid, nspOid, + false, objsMoved); } heap_close(classRel, RowExclusiveLock); - - /* close rel, but keep lock until commit */ - relation_close(rel, NoLock); } /* @@ -9783,10 +9821,11 @@ AlterTableNamespace(AlterObjectSchemaStmt *stmt) void AlterRelationNamespaceInternal(Relation classRel, Oid relOid, Oid oldNspOid, Oid newNspOid, - bool hasDependEntry) + bool hasDependEntry, ObjectAddresses *objsMoved) { HeapTuple classTup; Form_pg_class classForm; + ObjectAddress thisobj; classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relOid)); if (!HeapTupleIsValid(classTup)) @@ -9795,27 +9834,39 @@ 
AlterRelationNamespaceInternal(Relation classRel, Oid relOid, Assert(classForm->relnamespace == oldNspOid); - /* check for duplicate name (more friendly than unique-index failure) */ - if (get_relname_relid(NameStr(classForm->relname), - newNspOid) != InvalidOid) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_TABLE), - errmsg("relation \"%s\" already exists in schema \"%s\"", - NameStr(classForm->relname), - get_namespace_name(newNspOid)))); + thisobj.classId = RelationRelationId; + thisobj.objectId = relOid; + thisobj.objectSubId = 0; - /* classTup is a copy, so OK to scribble on */ - classForm->relnamespace = newNspOid; + /* + * Do nothing when there's nothing to do. + */ + if (!object_address_present(&thisobj, objsMoved)) + { + /* check for duplicate name (more friendly than unique-index failure) */ + if (get_relname_relid(NameStr(classForm->relname), + newNspOid) != InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists in schema \"%s\"", + NameStr(classForm->relname), + get_namespace_name(newNspOid)))); + + /* classTup is a copy, so OK to scribble on */ + classForm->relnamespace = newNspOid; - simple_heap_update(classRel, &classTup->t_self, classTup); - CatalogUpdateIndexes(classRel, classTup); + simple_heap_update(classRel, &classTup->t_self, classTup); + CatalogUpdateIndexes(classRel, classTup); - /* Update dependency on schema if caller said so */ - if (hasDependEntry && - changeDependencyFor(RelationRelationId, relOid, - NamespaceRelationId, oldNspOid, newNspOid) != 1) - elog(ERROR, "failed to change schema dependency for relation \"%s\"", - NameStr(classForm->relname)); + /* Update dependency on schema if caller said so */ + if (hasDependEntry && + changeDependencyFor(RelationRelationId, relOid, + NamespaceRelationId, oldNspOid, newNspOid) != 1) + elog(ERROR, "failed to change schema dependency for relation \"%s\"", + NameStr(classForm->relname)); + + add_exact_object_address(&thisobj, objsMoved); + } heap_freetuple(classTup); } @@ -9828,7 +9879,7 @@ AlterRelationNamespaceInternal(Relation classRel, Oid relOid, */ static void AlterIndexNamespaces(Relation classRel, Relation rel, - Oid oldNspOid, Oid newNspOid) + Oid oldNspOid, Oid newNspOid, ObjectAddresses *objsMoved) { List *indexList; ListCell *l; @@ -9838,15 +9889,27 @@ AlterIndexNamespaces(Relation classRel, Relation rel, foreach(l, indexList) { Oid indexOid = lfirst_oid(l); + ObjectAddress thisobj; + + thisobj.classId = RelationRelationId; + thisobj.objectId = indexOid; + thisobj.objectSubId = 0; /* * Note: currently, the index will not have its own dependency on the * namespace, so we don't need to do changeDependencyFor(). There's no * row type in pg_type, either. + * + * XXX this objsMoved test may be pointless -- surely we have a single + * dependency link from a relation to each index? 
*/ - AlterRelationNamespaceInternal(classRel, indexOid, - oldNspOid, newNspOid, - false); + if (!object_address_present(&thisobj, objsMoved)) + { + AlterRelationNamespaceInternal(classRel, indexOid, + oldNspOid, newNspOid, + false, objsMoved); + add_exact_object_address(&thisobj, objsMoved); + } } list_free(indexList); @@ -9861,7 +9924,8 @@ AlterIndexNamespaces(Relation classRel, Relation rel, */ static void AlterSeqNamespaces(Relation classRel, Relation rel, - Oid oldNspOid, Oid newNspOid, const char *newNspName, LOCKMODE lockmode) + Oid oldNspOid, Oid newNspOid, ObjectAddresses *objsMoved, + LOCKMODE lockmode) { Relation depRel; SysScanDesc scan; @@ -9913,14 +9977,14 @@ AlterSeqNamespaces(Relation classRel, Relation rel, /* Fix the pg_class and pg_depend entries */ AlterRelationNamespaceInternal(classRel, depForm->objid, oldNspOid, newNspOid, - true); + true, objsMoved); /* * Sequences have entries in pg_type. We need to be careful to move * them to the new namespace, too. */ AlterTypeNamespaceInternal(RelationGetForm(seqRel)->reltype, - newNspOid, false, false); + newNspOid, false, false, objsMoved); /* Now we can close it. Keep the lock till end of transaction. */ relation_close(seqRel, NoLock); diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 08899aeece..5081d8411e 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -1424,25 +1424,3 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "tblspc_redo: unknown op code %u", info); } - -void -tblspc_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_TBLSPC_CREATE) - { - xl_tblspc_create_rec *xlrec = (xl_tblspc_create_rec *) rec; - - appendStringInfo(buf, "create tablespace: %u \"%s\"", - xlrec->ts_id, xlrec->ts_path); - } - else if (info == XLOG_TBLSPC_DROP) - { - xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) rec; - - appendStringInfo(buf, "drop tablespace: %u", xlrec->ts_id); - } - else - appendStringInfo(buf, "UNKNOWN"); -} diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 4d3ed9cb62..91ef779c65 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2571,8 +2571,7 @@ GetTupleForTrigger(EState *estate, if (newSlot != NULL) { HTSU_Result test; - ItemPointerData update_ctid; - TransactionId update_xmax; + HeapUpdateFailureData hufd; *newSlot = NULL; @@ -2584,13 +2583,27 @@ GetTupleForTrigger(EState *estate, */ ltrmark:; tuple.t_self = *tid; - test = heap_lock_tuple(relation, &tuple, &buffer, - &update_ctid, &update_xmax, + test = heap_lock_tuple(relation, &tuple, estate->es_output_cid, - LockTupleExclusive, false); + LockTupleExclusive, false /* wait */, + &buffer, &hufd); switch (test) { case HeapTupleSelfUpdated: + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. We ignore the tuple in the former case, and + * throw error in the latter case, for the same reasons + * enumerated in ExecUpdate and ExecDelete in + * nodeModifyTable.c. 
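
Starting with the trigger.c hunk above, heap_lock_tuple (and, further down, heap_update and heap_delete) report failure details through a single HeapUpdateFailureData out-parameter instead of separate ctid and xmax pointers. Neither the struct nor the new prototypes appear in this excerpt; inferred from the call sites, they are presumably along these lines:

typedef struct HeapUpdateFailureData
{
    ItemPointerData ctid;   /* where the tuple went, if it was updated */
    TransactionId   xmax;   /* xid that updated or deleted it */
    CommandId       cmax;   /* its cid, when our own transaction did it */
} HeapUpdateFailureData;

extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
                                   CommandId cid, LockTupleMode mode,
                                   bool nowait, Buffer *buffer,
                                   HeapUpdateFailureData *hufd);

The extra cmax field is what lets callers that get HeapTupleSelfUpdated distinguish an update made by the current command (silently ignore the tuple) from one made by a later command in the same transaction, typically a BEFORE trigger, which now raises the new "already modified by an operation triggered by the current command" error.
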
+ */ + if (hufd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be updated was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + /* treat it as deleted; do not process */ ReleaseBuffer(buffer); return NULL; @@ -2604,7 +2617,7 @@ ltrmark:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (!ItemPointerEquals(&update_ctid, &tuple.t_self)) + if (!ItemPointerEquals(&hufd.ctid, &tuple.t_self)) { /* it was updated, so look at the updated version */ TupleTableSlot *epqslot; @@ -2613,11 +2626,11 @@ ltrmark:; epqstate, relation, relinfo->ri_RangeTableIndex, - &update_ctid, - update_xmax); + &hufd.ctid, + hufd.xmax); if (!TupIsNull(epqslot)) { - *tid = update_ctid; + *tid = hufd.ctid; *newSlot = epqslot; /* @@ -2649,6 +2662,16 @@ ltrmark:; buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + /* + * Although we already know this tuple is valid, we must lock the + * buffer to ensure that no one has a buffer cleanup lock; otherwise + * they might move the tuple while we try to copy it. But we can + * release the lock before actually doing the heap_copytuple call, + * since holding pin is sufficient to prevent anyone from getting a + * cleanup lock they don't already hold. + */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); @@ -2658,6 +2681,8 @@ ltrmark:; tuple.t_len = ItemIdGetLength(lp); tuple.t_self = *tid; tuple.t_tableOid = RelationGetRelid(relation); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } result = heap_copytuple(&tuple); diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 6cb6fd56fd..36de6d7e28 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -1169,7 +1169,7 @@ DefineEnum(CreateEnumStmt *stmt) * Adds a new label to an existing enum. */ void -AlterEnum(AlterEnumStmt *stmt) +AlterEnum(AlterEnumStmt *stmt, bool isTopLevel) { Oid enum_type_oid; TypeName *typename; @@ -1183,12 +1183,31 @@ AlterEnum(AlterEnumStmt *stmt) if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for type %u", enum_type_oid); + /* + * Ordinarily we disallow adding values within transaction blocks, because + * we can't cope with enum OID values getting into indexes and then having + * their defining pg_enum entries go away. However, it's okay if the enum + * type was created in the current transaction, since then there can be + * no such indexes that wouldn't themselves go away on rollback. (We + * support this case because pg_dump --binary-upgrade needs it.) We test + * this by seeing if the pg_type row has xmin == current XID and is not + * HEAP_UPDATED. If it is HEAP_UPDATED, we can't be sure whether the + * type was created or only modified in this xact. So we are disallowing + * some cases that could theoretically be safe; but fortunately pg_dump + * only needs the simplest case. + */ + if (HeapTupleHeaderGetXmin(tup->t_data) == GetCurrentTransactionId() && + !(tup->t_data->t_infomask & HEAP_UPDATED)) + /* safe to do inside transaction block */ ; + else + PreventTransactionChain(isTopLevel, "ALTER TYPE ... 
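
The typecmds.c comment above describes the new rule for ALTER TYPE ... ADD inside a transaction block: it is allowed only if the pg_type row was created, and not subsequently updated, by the current transaction. The test is compact enough to restate as a predicate; the helper name is hypothetical, the individual checks are taken verbatim from the hunk:

/* Hypothetical predicate capturing the test AlterEnum now performs. */
static bool
enum_created_in_current_xact(HeapTuple typeTup)
{
    return HeapTupleHeaderGetXmin(typeTup->t_data) == GetCurrentTransactionId() &&
           !(typeTup->t_data->t_infomask & HEAP_UPDATED);
}

If the predicate is false, PreventTransactionChain() rejects the command, because enum OIDs could otherwise end up in indexes while their pg_enum rows vanish on rollback.
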
ADD"); + /* Check it's an enum and check user has permission to ALTER the enum */ checkEnumOwner(tup); /* Add the new label */ AddEnumLabel(enum_type_oid, stmt->newVal, - stmt->newValNeighbor, stmt->newValIsAfter, + stmt->newValNeighbor, stmt->newValIsAfter, stmt->skipIfExists); ReleaseSysCache(tup); @@ -3322,6 +3341,7 @@ AlterTypeNamespace(List *names, const char *newschema, ObjectType objecttype) TypeName *typename; Oid typeOid; Oid nspOid; + ObjectAddresses *objsMoved; /* Make a TypeName so we can use standard type lookup machinery */ typename = makeTypeNameFromNameList(names); @@ -3337,11 +3357,13 @@ AlterTypeNamespace(List *names, const char *newschema, ObjectType objecttype) /* get schema OID and check its permissions */ nspOid = LookupCreationNamespace(newschema); - AlterTypeNamespace_oid(typeOid, nspOid); + objsMoved = new_object_addresses(); + AlterTypeNamespace_oid(typeOid, nspOid, objsMoved); + free_object_addresses(objsMoved); } Oid -AlterTypeNamespace_oid(Oid typeOid, Oid nspOid) +AlterTypeNamespace_oid(Oid typeOid, Oid nspOid, ObjectAddresses *objsMoved) { Oid elemOid; @@ -3360,7 +3382,7 @@ AlterTypeNamespace_oid(Oid typeOid, Oid nspOid) format_type_be(elemOid)))); /* and do the work */ - return AlterTypeNamespaceInternal(typeOid, nspOid, false, true); + return AlterTypeNamespaceInternal(typeOid, nspOid, false, true, objsMoved); } /* @@ -3381,7 +3403,8 @@ AlterTypeNamespace_oid(Oid typeOid, Oid nspOid) Oid AlterTypeNamespaceInternal(Oid typeOid, Oid nspOid, bool isImplicitArray, - bool errorOnTableType) + bool errorOnTableType, + ObjectAddresses *objsMoved) { Relation rel; HeapTuple tup; @@ -3389,6 +3412,17 @@ AlterTypeNamespaceInternal(Oid typeOid, Oid nspOid, Oid oldNspOid; Oid arrayOid; bool isCompositeType; + ObjectAddress thisobj; + + /* + * Make sure we haven't moved this object previously. + */ + thisobj.classId = TypeRelationId; + thisobj.objectId = typeOid; + thisobj.objectSubId = 0; + + if (object_address_present(&thisobj, objsMoved)) + return InvalidOid; rel = heap_open(TypeRelationId, RowExclusiveLock); @@ -3449,7 +3483,7 @@ AlterTypeNamespaceInternal(Oid typeOid, Oid nspOid, AlterRelationNamespaceInternal(classRel, typform->typrelid, oldNspOid, nspOid, - false); + false, objsMoved); heap_close(classRel, RowExclusiveLock); @@ -3458,13 +3492,14 @@ AlterTypeNamespaceInternal(Oid typeOid, Oid nspOid, * currently support this, but probably will someday). */ AlterConstraintNamespaces(typform->typrelid, oldNspOid, - nspOid, false); + nspOid, false, objsMoved); } else { /* If it's a domain, it might have constraints */ if (typform->typtype == TYPTYPE_DOMAIN) - AlterConstraintNamespaces(typeOid, oldNspOid, nspOid, true); + AlterConstraintNamespaces(typeOid, oldNspOid, nspOid, true, + objsMoved); } /* @@ -3482,9 +3517,11 @@ AlterTypeNamespaceInternal(Oid typeOid, Oid nspOid, heap_close(rel, RowExclusiveLock); + add_exact_object_address(&thisobj, objsMoved); + /* Recursively alter the associated array type, if any */ if (OidIsValid(arrayOid)) - AlterTypeNamespaceInternal(arrayOid, nspOid, true, true); + AlterTypeNamespaceInternal(arrayOid, nspOid, true, true, objsMoved); return oldNspOid; } diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 14d1c08c97..c852895448 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1097,9 +1097,16 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound) /* - * Open all the indexes of the given relation, obtaining the specified kind - * of lock on each. 
Return an array of Relation pointers for the indexes - * into *Irel, and the number of indexes into *nindexes. + * Open all the vacuumable indexes of the given relation, obtaining the + * specified kind of lock on each. Return an array of Relation pointers for + * the indexes into *Irel, and the number of indexes into *nindexes. + * + * We consider an index vacuumable if it is marked insertable (IndexIsReady). + * If it isn't, probably a CREATE INDEX CONCURRENTLY command failed early in + * execution, and what we have is too corrupt to be processable. We will + * vacuum even if the index isn't indisvalid; this is important because in a + * unique index, uniqueness checks will be performed anyway and had better not + * hit dangling index pointers. */ void vac_open_indexes(Relation relation, LOCKMODE lockmode, @@ -1113,21 +1120,30 @@ vac_open_indexes(Relation relation, LOCKMODE lockmode, indexoidlist = RelationGetIndexList(relation); - *nindexes = list_length(indexoidlist); + /* allocate enough memory for all indexes */ + i = list_length(indexoidlist); - if (*nindexes > 0) - *Irel = (Relation *) palloc(*nindexes * sizeof(Relation)); + if (i > 0) + *Irel = (Relation *) palloc(i * sizeof(Relation)); else *Irel = NULL; + /* collect just the ready indexes */ i = 0; foreach(indexoidscan, indexoidlist) { Oid indexoid = lfirst_oid(indexoidscan); + Relation indrel; - (*Irel)[i++] = index_open(indexoid, lockmode); + indrel = index_open(indexoid, lockmode); + if (IndexIsReady(indrel->rd_index)) + (*Irel)[i++] = indrel; + else + index_close(indrel, lockmode); } + *nindexes = i; + list_free(indexoidlist); } diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index c9253a9c47..503684936b 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -949,6 +949,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vacrelstats->scanned_pages, num_tuples); + /* + * Release any remaining pin on visibility map page. + */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) @@ -966,13 +975,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, vacrelstats->num_index_scans++; } - /* Release the pin on the visibility map page */ - if (BufferIsValid(vmbuffer)) - { - ReleaseBuffer(vmbuffer); - vmbuffer = InvalidBuffer; - } - /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index d966be543e..dbd3755b1b 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1802,8 +1802,7 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, if (heap_fetch(relation, &SnapshotDirty, &tuple, &buffer, true, NULL)) { HTSU_Result test; - ItemPointerData update_ctid; - TransactionId update_xmax; + HeapUpdateFailureData hufd; /* * If xmin isn't what we're expecting, the slot must have been @@ -1838,13 +1837,13 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, /* * If tuple was inserted by our own transaction, we have to check * cmin against es_output_cid: cmin >= current CID means our - * command cannot see the tuple, so we should ignore it. 
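
Many hunks in this patch, including vac_open_indexes above, replace direct tests of indexForm->indisvalid and indisready with IndexIsValid() and IndexIsReady(). The macro definitions are not included in this excerpt; presumably they are thin wrappers in the pg_index header along these lines, so that the notion of a usable index lives in one place:

/* Presumed definitions (not shown in this excerpt). */
#define IndexIsValid(indexForm) ((indexForm)->indisvalid)
#define IndexIsReady(indexForm) ((indexForm)->indisready)

vac_open_indexes illustrates why the two flags are distinct: an index that is ready but not yet valid still receives inserts, so VACUUM must process it, while an index that never became ready (a CREATE INDEX CONCURRENTLY that failed early) is skipped entirely.
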
Without - * this we are open to the "Halloween problem" of indefinitely - * re-updating the same tuple. (We need not check cmax because - * HeapTupleSatisfiesDirty will consider a tuple deleted by our - * transaction dead, regardless of cmax.) We just checked that - * priorXmax == xmin, so we can test that variable instead of - * doing HeapTupleHeaderGetXmin again. + * command cannot see the tuple, so we should ignore it. + * Otherwise heap_lock_tuple() will throw an error, and so would + * any later attempt to update or delete the tuple. (We need not + * check cmax because HeapTupleSatisfiesDirty will consider a + * tuple deleted by our transaction dead, regardless of cmax.) + * Wee just checked that priorXmax == xmin, so we can test that + * variable instead of doing HeapTupleHeaderGetXmin again. */ if (TransactionIdIsCurrentTransactionId(priorXmax) && HeapTupleHeaderGetCmin(tuple.t_data) >= estate->es_output_cid) @@ -1856,17 +1855,29 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, /* * This is a live tuple, so now try to lock it. */ - test = heap_lock_tuple(relation, &tuple, &buffer, - &update_ctid, &update_xmax, + test = heap_lock_tuple(relation, &tuple, estate->es_output_cid, - lockmode, false); + lockmode, false /* wait */, + &buffer, &hufd); /* We now have two pins on the buffer, get rid of one */ ReleaseBuffer(buffer); switch (test) { case HeapTupleSelfUpdated: - /* treat it as deleted; do not process */ + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. We *must* ignore the tuple in the former + * case, so as to avoid the "Halloween problem" of + * repeated update attempts. In the latter case it might + * be sensible to fetch the updated tuple instead, but + * doing so would require changing heap_lock_tuple as well + * as heap_update and heap_delete to not complain about + * updating "invisible" tuples, which seems pretty scary. + * So for now, treat the tuple as deleted and do not + * process. + */ ReleaseBuffer(buffer); return NULL; @@ -1880,12 +1891,12 @@ EvalPlanQualFetch(EState *estate, Relation relation, int lockmode, ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (!ItemPointerEquals(&update_ctid, &tuple.t_self)) + if (!ItemPointerEquals(&hufd.ctid, &tuple.t_self)) { /* it was updated, so look at the updated version */ - tuple.t_self = update_ctid; + tuple.t_self = hufd.ctid; /* updated row should have xmin matching this xmax */ - priorXmax = update_xmax; + priorXmax = hufd.xmax; continue; } /* tuple was deleted, so give up */ diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 0bbd0d4640..d6cf06c6fd 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -906,6 +906,9 @@ ExecOpenIndices(ResultRelInfo *resultRelInfo) /* * For each index, open the index relation and save pg_index info. We * acquire RowExclusiveLock, signifying we will update the index. + * + * Note: we do this even if the index is not IndexIsReady; it's not worth + * the trouble to optimize for the case where it isn't. 
*/ i = 0; foreach(l, indexoidlist) diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index ec0825b460..6474393d7f 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -71,8 +71,7 @@ ExecLockRows(LockRowsState *node) bool isNull; HeapTupleData tuple; Buffer buffer; - ItemPointerData update_ctid; - TransactionId update_xmax; + HeapUpdateFailureData hufd; LockTupleMode lockmode; HTSU_Result test; HeapTuple copyTuple; @@ -117,15 +116,26 @@ ExecLockRows(LockRowsState *node) else lockmode = LockTupleShared; - test = heap_lock_tuple(erm->relation, &tuple, &buffer, - &update_ctid, &update_xmax, + test = heap_lock_tuple(erm->relation, &tuple, estate->es_output_cid, - lockmode, erm->noWait); + lockmode, erm->noWait, + &buffer, &hufd); ReleaseBuffer(buffer); switch (test) { case HeapTupleSelfUpdated: - /* treat it as deleted; do not process */ + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. We *must* ignore the tuple in the former + * case, so as to avoid the "Halloween problem" of repeated + * update attempts. In the latter case it might be sensible + * to fetch the updated tuple instead, but doing so would + * require changing heap_lock_tuple as well as heap_update and + * heap_delete to not complain about updating "invisible" + * tuples, which seems pretty scary. So for now, treat the + * tuple as deleted and do not process. + */ goto lnext; case HeapTupleMayBeUpdated: @@ -137,8 +147,7 @@ ExecLockRows(LockRowsState *node) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (ItemPointerEquals(&update_ctid, - &tuple.t_self)) + if (ItemPointerEquals(&hufd.ctid, &tuple.t_self)) { /* Tuple was deleted, so don't return it */ goto lnext; @@ -146,7 +155,7 @@ ExecLockRows(LockRowsState *node) /* updated, so fetch and lock the updated version */ copyTuple = EvalPlanQualFetch(estate, erm->relation, lockmode, - &update_ctid, update_xmax); + &hufd.ctid, hufd.xmax); if (copyTuple == NULL) { diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c index d5141ba54e..9dc25eefc4 100644 --- a/src/backend/executor/nodeMergeAppend.c +++ b/src/backend/executor/nodeMergeAppend.c @@ -41,17 +41,16 @@ #include "executor/execdebug.h" #include "executor/nodeMergeAppend.h" +#include "lib/binaryheap.h" + /* - * It gets quite confusing having a heap array (indexed by integers) which - * contains integers which index into the slots array. These typedefs try to - * clear it up, but they're only documentation. + * We have one slot for each item in the heap array. We use SlotNumber + * to store slot indexes. This doesn't actually provide any formal + * type-safety, but it makes the code more self-documenting. 
*/ -typedef int SlotNumber; -typedef int HeapPosition; +typedef int32 SlotNumber; -static void heap_insert_slot(MergeAppendState *node, SlotNumber new_slot); -static void heap_siftup_slot(MergeAppendState *node); -static int32 heap_compare_slots(MergeAppendState *node, SlotNumber slot1, SlotNumber slot2); +static int heap_compare_slots(Datum a, Datum b, void *arg); /* ---------------------------------------------------------------- @@ -88,7 +87,8 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) mergestate->ms_nplans = nplans; mergestate->ms_slots = (TupleTableSlot **) palloc0(sizeof(TupleTableSlot *) * nplans); - mergestate->ms_heap = (int *) palloc0(sizeof(int) * nplans); + mergestate->ms_heap = binaryheap_allocate(nplans, heap_compare_slots, + mergestate); /* * Miscellaneous initialization @@ -143,9 +143,7 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) /* * initialize to show we have not run the subplans yet */ - mergestate->ms_heap_size = 0; mergestate->ms_initialized = false; - mergestate->ms_last_slot = -1; return mergestate; } @@ -172,101 +170,53 @@ ExecMergeAppend(MergeAppendState *node) { node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); if (!TupIsNull(node->ms_slots[i])) - heap_insert_slot(node, i); + binaryheap_add_unordered(node->ms_heap, Int32GetDatum(i)); } + binaryheap_build(node->ms_heap); node->ms_initialized = true; } else { /* * Otherwise, pull the next tuple from whichever subplan we returned - * from last time, and insert it into the heap. (We could simplify - * the logic a bit by doing this before returning from the prior call, - * but it's better to not pull tuples until necessary.) + * from last time, and reinsert the subplan index into the heap, + * because it might now compare differently against the existing + * elements of the heap. (We could perhaps simplify the logic a bit + * by doing this before returning from the prior call, but it's better + * to not pull tuples until necessary.) */ - i = node->ms_last_slot; + i = DatumGetInt32(binaryheap_first(node->ms_heap)); node->ms_slots[i] = ExecProcNode(node->mergeplans[i]); if (!TupIsNull(node->ms_slots[i])) - heap_insert_slot(node, i); + binaryheap_replace_first(node->ms_heap, Int32GetDatum(i)); + else + (void) binaryheap_remove_first(node->ms_heap); } - if (node->ms_heap_size > 0) - { - /* Return the topmost heap node, and sift up the remaining nodes */ - i = node->ms_heap[0]; - result = node->ms_slots[i]; - node->ms_last_slot = i; - heap_siftup_slot(node); - } - else + if (binaryheap_empty(node->ms_heap)) { /* All the subplans are exhausted, and so is the heap */ result = ExecClearTuple(node->ps.ps_ResultTupleSlot); } - - return result; -} - -/* - * Insert a new slot into the heap. The slot must contain a valid tuple. - */ -static void -heap_insert_slot(MergeAppendState *node, SlotNumber new_slot) -{ - SlotNumber *heap = node->ms_heap; - HeapPosition j; - - Assert(!TupIsNull(node->ms_slots[new_slot])); - - j = node->ms_heap_size++; /* j is where the "hole" is */ - while (j > 0) + else { - int i = (j - 1) / 2; - - if (heap_compare_slots(node, new_slot, node->ms_heap[i]) >= 0) - break; - heap[j] = heap[i]; - j = i; + i = DatumGetInt32(binaryheap_first(node->ms_heap)); + result = node->ms_slots[i]; } - heap[j] = new_slot; -} -/* - * Delete the heap top (the slot in heap[0]), and sift up. 
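
ExecMergeAppend's steady state is now: look at the subplan index on top of the heap, pull that subplan's next tuple, and either re-heapify it or drop it. binaryheap_replace_first exists precisely for the common case where the subplan produces another tuple; the two forms below leave the heap with the same contents, but the first performs a single sift:

binaryheap_replace_first(node->ms_heap, Int32GetDatum(i));

/* versus the pop-then-push equivalent, which sifts twice */
(void) binaryheap_remove_first(node->ms_heap);
binaryheap_add(node->ms_heap, Int32GetDatum(i));
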
- */ -static void -heap_siftup_slot(MergeAppendState *node) -{ - SlotNumber *heap = node->ms_heap; - HeapPosition i, - n; - - if (--node->ms_heap_size <= 0) - return; - n = node->ms_heap_size; /* heap[n] needs to be reinserted */ - i = 0; /* i is where the "hole" is */ - for (;;) - { - int j = 2 * i + 1; - - if (j >= n) - break; - if (j + 1 < n && heap_compare_slots(node, heap[j], heap[j + 1]) > 0) - j++; - if (heap_compare_slots(node, heap[n], heap[j]) <= 0) - break; - heap[i] = heap[j]; - i = j; - } - heap[i] = heap[n]; + return result; } /* * Compare the tuples in the two given slots. */ static int32 -heap_compare_slots(MergeAppendState *node, SlotNumber slot1, SlotNumber slot2) +heap_compare_slots(Datum a, Datum b, void *arg) { + MergeAppendState *node = (MergeAppendState *) arg; + SlotNumber slot1 = DatumGetInt32(a); + SlotNumber slot2 = DatumGetInt32(b); + TupleTableSlot *s1 = node->ms_slots[slot1]; TupleTableSlot *s2 = node->ms_slots[slot2]; int nkey; @@ -291,7 +241,7 @@ heap_compare_slots(MergeAppendState *node, SlotNumber slot1, SlotNumber slot2) datum2, isNull2, sortKey); if (compare != 0) - return compare; + return -compare; } return 0; } @@ -347,7 +297,5 @@ ExecReScanMergeAppend(MergeAppendState *node) if (subnode->chgParam == NULL) ExecReScan(subnode); } - node->ms_heap_size = 0; node->ms_initialized = false; - node->ms_last_slot = -1; } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 26a59d0121..d31015c654 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -295,8 +295,7 @@ ExecDelete(ItemPointer tupleid, ResultRelInfo *resultRelInfo; Relation resultRelationDesc; HTSU_Result result; - ItemPointerData update_ctid; - TransactionId update_xmax; + HeapUpdateFailureData hufd; /* * get information on the (current) result relation @@ -348,14 +347,44 @@ ExecDelete(ItemPointer tupleid, */ ldelete:; result = heap_delete(resultRelationDesc, tupleid, - &update_ctid, &update_xmax, estate->es_output_cid, estate->es_crosscheck_snapshot, - true /* wait for commit */ ); + true /* wait for commit */, + &hufd); switch (result) { case HeapTupleSelfUpdated: - /* already deleted by self; nothing to do */ + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. The former case is possible in a join DELETE + * where multiple tuples join to the same target tuple. + * This is somewhat questionable, but Postgres has always + * allowed it: we just ignore additional deletion attempts. + * + * The latter case arises if the tuple is modified by a + * command in a BEFORE trigger, or perhaps by a command in a + * volatile function used in the query. In such situations we + * should not ignore the deletion, but it is equally unsafe to + * proceed. We don't want to discard the original DELETE + * while keeping the triggered actions based on its deletion; + * and it would be no better to allow the original DELETE + * while discarding updates that it triggered. The row update + * carries some information that might be important according + * to business rules; so throwing an error is the only safe + * course. + * + * If a trigger actually intends this type of interaction, + * it can re-execute the DELETE and then return NULL to + * cancel the outer delete. 
+ */ + if (hufd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be updated was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + + /* Else, already deleted by self; nothing to do */ return NULL; case HeapTupleMayBeUpdated: @@ -366,7 +395,7 @@ ldelete:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (!ItemPointerEquals(tupleid, &update_ctid)) + if (!ItemPointerEquals(tupleid, &hufd.ctid)) { TupleTableSlot *epqslot; @@ -374,11 +403,11 @@ ldelete:; epqstate, resultRelationDesc, resultRelInfo->ri_RangeTableIndex, - &update_ctid, - update_xmax); + &hufd.ctid, + hufd.xmax); if (!TupIsNull(epqslot)) { - *tupleid = update_ctid; + *tupleid = hufd.ctid; goto ldelete; } } @@ -482,8 +511,7 @@ ExecUpdate(ItemPointer tupleid, ResultRelInfo *resultRelInfo; Relation resultRelationDesc; HTSU_Result result; - ItemPointerData update_ctid; - TransactionId update_xmax; + HeapUpdateFailureData hufd; List *recheckIndexes = NIL; /* @@ -564,14 +592,43 @@ lreplace:; * mode transactions. */ result = heap_update(resultRelationDesc, tupleid, tuple, - &update_ctid, &update_xmax, estate->es_output_cid, estate->es_crosscheck_snapshot, - true /* wait for commit */ ); + true /* wait for commit */, + &hufd); switch (result) { case HeapTupleSelfUpdated: - /* already deleted by self; nothing to do */ + /* + * The target tuple was already updated or deleted by the + * current command, or by a later command in the current + * transaction. The former case is possible in a join UPDATE + * where multiple tuples join to the same target tuple. + * This is pretty questionable, but Postgres has always + * allowed it: we just execute the first update action and + * ignore additional update attempts. + * + * The latter case arises if the tuple is modified by a + * command in a BEFORE trigger, or perhaps by a command in a + * volatile function used in the query. In such situations we + * should not ignore the update, but it is equally unsafe to + * proceed. We don't want to discard the original UPDATE + * while keeping the triggered actions based on it; and we + * have no principled way to merge this update with the + * previous ones. So throwing an error is the only safe + * course. + * + * If a trigger actually intends this type of interaction, + * it can re-execute the UPDATE (assuming it can figure out + * how) and then return NULL to cancel the outer update. 
+ */ + if (hufd.cmax != estate->es_output_cid) + ereport(ERROR, + (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), + errmsg("tuple to be updated was already modified by an operation triggered by the current command"), + errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); + + /* Else, already updated by self; nothing to do */ return NULL; case HeapTupleMayBeUpdated: @@ -582,7 +639,7 @@ lreplace:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (!ItemPointerEquals(tupleid, &update_ctid)) + if (!ItemPointerEquals(tupleid, &hufd.ctid)) { TupleTableSlot *epqslot; @@ -590,11 +647,11 @@ lreplace:; epqstate, resultRelationDesc, resultRelInfo->ri_RangeTableIndex, - &update_ctid, - update_xmax); + &hufd.ctid, + hufd.xmax); if (!TupIsNull(epqslot)) { - *tupleid = update_ctid; + *tupleid = hufd.ctid; slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot); tuple = ExecMaterializeSlot(slot); goto lreplace; diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index bf8c4c7113..416a2c4f3b 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -1127,6 +1127,7 @@ SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, CachedPlan *cplan; List *stmt_list; char *query_string; + Snapshot snapshot; MemoryContext oldcontext; Portal portal; @@ -1269,6 +1270,15 @@ SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, } } + /* Set up the snapshot to use. */ + if (read_only) + snapshot = GetActiveSnapshot(); + else + { + CommandCounterIncrement(); + snapshot = GetTransactionSnapshot(); + } + /* * If the plan has parameters, copy them into the portal. Note that this * must be done after revalidating the plan, because in dynamic parameter @@ -1284,13 +1294,7 @@ SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, /* * Start portal execution. */ - if (read_only) - PortalStart(portal, paramLI, 0, true); - else - { - CommandCounterIncrement(); - PortalStart(portal, paramLI, 0, false); - } + PortalStart(portal, paramLI, 0, snapshot); Assert(portal->strategy != PORTAL_MULTI_QUERY); diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile index 2e1061e24a..327a1bc16d 100644 --- a/src/backend/lib/Makefile +++ b/src/backend/lib/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/lib top_builddir = ../../.. 
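
The PortalStart changes in portalcmds.c, prepare.c and the spi.c hunk above all follow from one signature change: the old use-active-snapshot boolean is replaced by an explicit Snapshot argument, so the caller now decides which snapshot the portal runs under. The spi.c hunk shows the intended pattern, reproduced here in isolation as a sketch:

Snapshot    snapshot;

if (read_only)
    snapshot = GetActiveSnapshot();
else
{
    CommandCounterIncrement();
    snapshot = GetTransactionSnapshot();
}

PortalStart(portal, paramLI, 0, snapshot);
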
include $(top_builddir)/src/Makefile.global -OBJS = dllist.o stringinfo.o +OBJS = ilist.o binaryheap.o stringinfo.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/binaryheap.c b/src/backend/lib/binaryheap.c new file mode 100644 index 0000000000..73c80e4dfd --- /dev/null +++ b/src/backend/lib/binaryheap.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * binaryheap.c + * A simple binary heap implementaion + * + * Portions Copyright (c) 2012, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/binaryheap.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "lib/binaryheap.h" + +static void sift_down(binaryheap *heap, int node_off); +static void sift_up(binaryheap *heap, int node_off); +static inline void swap_nodes(binaryheap *heap, int a, int b); + +/* + * binaryheap_allocate + * + * Returns a pointer to a newly-allocated heap that has the capacity to + * store the given number of nodes, with the heap property defined by + * the given comparator function, which will be invoked with the additional + * argument specified by 'arg'. + */ +binaryheap * +binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg) +{ + int sz; + binaryheap *heap; + + sz = offsetof(binaryheap, bh_nodes) + sizeof(Datum) * capacity; + heap = palloc(sz); + heap->bh_size = 0; + heap->bh_space = capacity; + heap->bh_has_heap_property = true; + heap->bh_compare = compare; + heap->bh_arg = arg; + + return heap; +} + +/* + * binaryheap_free + * + * Releases memory used by the given binaryheap. + */ +void +binaryheap_free(binaryheap *heap) +{ + pfree(heap); +} + +/* + * These utility functions return the offset of the left child, right + * child, and parent of the node at the given index, respectively. + * + * The heap is represented as an array of nodes, with the root node + * stored at index 0. The left child of node i is at index 2*i+1, and + * the right child at 2*i+2. The parent of node i is at index (i-1)/2. + */ + +static inline int +left_offset(int i) +{ + return 2 * i + 1; +} + +static inline int +right_offset(int i) +{ + return 2 * i + 2; +} + +static inline int +parent_offset(int i) +{ + return (i - 1) / 2; +} + +/* + * binaryheap_add_unordered + * + * Adds the given datum to the end of the heap's list of nodes in O(1) without + * preserving the heap property. This is a convenience to add elements quickly + * to a new heap. To obtain a valid heap, one must call binaryheap_build() + * afterwards. + */ +void +binaryheap_add_unordered(binaryheap *heap, Datum d) +{ + if (heap->bh_size >= heap->bh_space) + elog(ERROR, "out of binary heap slots"); + heap->bh_has_heap_property = false; + heap->bh_nodes[heap->bh_size] = d; + heap->bh_size++; +} + +/* + * binaryheap_build + * + * Assembles a valid heap in O(n) from the nodes added by + * binaryheap_add_unordered(). Not needed otherwise. + */ +void +binaryheap_build(binaryheap *heap) +{ + int i; + + for (i = parent_offset(heap->bh_size - 1); i >= 0; i--) + sift_down(heap, i); + heap->bh_has_heap_property = true; +} + +/* + * binaryheap_add + * + * Adds the given datum to the heap in O(log n) time, while preserving + * the heap property. 
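
binaryheap_allocate sizes its allocation as offsetof(binaryheap, bh_nodes) plus sizeof(Datum) per node, so the node array is laid out inline after the bookkeeping fields. The header lib/binaryheap.h is not part of this excerpt; based on the fields the implementation touches, it presumably looks roughly like this:

typedef int (*binaryheap_comparator) (Datum a, Datum b, void *arg);

typedef struct binaryheap
{
    int         bh_size;                /* nodes currently stored */
    int         bh_space;               /* capacity fixed at allocation time */
    bool        bh_has_heap_property;   /* false between add_unordered and build */
    binaryheap_comparator bh_compare;   /* user-supplied ordering */
    void       *bh_arg;                 /* opaque argument passed to bh_compare */
    Datum       bh_nodes[1];            /* variable-length array, sized via offsetof */
} binaryheap;

#define binaryheap_empty(h) ((h)->bh_size == 0)
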
+ */ +void +binaryheap_add(binaryheap *heap, Datum d) +{ + if (heap->bh_size >= heap->bh_space) + elog(ERROR, "out of binary heap slots"); + heap->bh_nodes[heap->bh_size] = d; + heap->bh_size++; + sift_up(heap, heap->bh_size - 1); +} + +/* + * binaryheap_first + * + * Returns a pointer to the first (root, topmost) node in the heap + * without modifying the heap. The caller must ensure that this + * routine is not used on an empty heap. Always O(1). + */ +Datum +binaryheap_first(binaryheap *heap) +{ + Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property); + return heap->bh_nodes[0]; +} + +/* + * binaryheap_remove_first + * + * Removes the first (root, topmost) node in the heap and returns a + * pointer to it after rebalancing the heap. The caller must ensure + * that this routine is not used on an empty heap. O(log n) worst + * case. + */ +Datum +binaryheap_remove_first(binaryheap *heap) +{ + Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property); + + if (heap->bh_size == 1) + { + heap->bh_size--; + return heap->bh_nodes[0]; + } + + /* + * Swap the root and last nodes, decrease the size of the heap (i.e. + * remove the former root node) and sift the new root node down to its + * correct position. + */ + swap_nodes(heap, 0, heap->bh_size - 1); + heap->bh_size--; + sift_down(heap, 0); + + return heap->bh_nodes[heap->bh_size]; +} + +/* + * binaryheap_replace_first + * + * Replace the topmost element of a non-empty heap, preserving the heap + * property. O(1) in the best case, or O(log n) if it must fall back to + * sifting the new node down. + */ +void +binaryheap_replace_first(binaryheap *heap, Datum d) +{ + Assert(!binaryheap_empty(heap) && heap->bh_has_heap_property); + + heap->bh_nodes[0] = d; + + if (heap->bh_size > 1) + sift_down(heap, 0); +} + +/* + * Swap the contents of two nodes. + */ +static inline void +swap_nodes(binaryheap *heap, int a, int b) +{ + Datum swap; + + swap = heap->bh_nodes[a]; + heap->bh_nodes[a] = heap->bh_nodes[b]; + heap->bh_nodes[b] = swap; +} + +/* + * Sift a node up to the highest position it can hold according to the + * comparator. + */ +static void +sift_up(binaryheap *heap, int node_off) +{ + while (node_off != 0) + { + int cmp; + int parent_off; + + /* + * If this node is smaller than its parent, the heap condition is + * satisfied, and we're done. + */ + parent_off = parent_offset(node_off); + cmp = heap->bh_compare(heap->bh_nodes[node_off], + heap->bh_nodes[parent_off], + heap->bh_arg); + if (cmp <= 0) + break; + + /* + * Otherwise, swap the node and its parent and go on to check the + * node's new parent. + */ + swap_nodes(heap, node_off, parent_off); + node_off = parent_off; + } +} + +/* + * Sift a node down from its current position to satisfy the heap + * property. + */ +static void +sift_down(binaryheap *heap, int node_off) +{ + while (true) + { + int left_off = left_offset(node_off); + int right_off = right_offset(node_off); + int swap_off = 0; + + /* Is the left child larger than the parent? */ + if (left_off < heap->bh_size && + heap->bh_compare(heap->bh_nodes[node_off], + heap->bh_nodes[left_off], + heap->bh_arg) < 0) + swap_off = left_off; + + /* Is the right child larger than the parent? 
*/ + if (right_off < heap->bh_size && + heap->bh_compare(heap->bh_nodes[node_off], + heap->bh_nodes[right_off], + heap->bh_arg) < 0) + { + /* swap with the larger child */ + if (!swap_off || + heap->bh_compare(heap->bh_nodes[left_off], + heap->bh_nodes[right_off], + heap->bh_arg) < 0) + swap_off = right_off; + } + + /* + * If we didn't find anything to swap, the heap condition is + * satisfied, and we're done. + */ + if (!swap_off) + break; + + /* + * Otherwise, swap the node with the child that violates the heap + * property; then go on to check its children. + */ + swap_nodes(heap, swap_off, node_off); + node_off = swap_off; + } +} diff --git a/src/backend/lib/dllist.c b/src/backend/lib/dllist.c deleted file mode 100644 index 52af56a079..0000000000 --- a/src/backend/lib/dllist.c +++ /dev/null @@ -1,214 +0,0 @@ -/*------------------------------------------------------------------------- - * - * dllist.c - * this is a simple doubly linked list implementation - * the elements of the lists are void* - * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/lib/dllist.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "lib/dllist.h" - - -Dllist * -DLNewList(void) -{ - Dllist *l; - - l = (Dllist *) palloc(sizeof(Dllist)); - - l->dll_head = NULL; - l->dll_tail = NULL; - - return l; -} - -void -DLInitList(Dllist *list) -{ - list->dll_head = NULL; - list->dll_tail = NULL; -} - -/* - * free up a list and all the nodes in it --- but *not* whatever the nodes - * might point to! - */ -void -DLFreeList(Dllist *list) -{ - Dlelem *curr; - - while ((curr = DLRemHead(list)) != NULL) - pfree(curr); - - pfree(list); -} - -Dlelem * -DLNewElem(void *val) -{ - Dlelem *e; - - e = (Dlelem *) palloc(sizeof(Dlelem)); - - e->dle_next = NULL; - e->dle_prev = NULL; - e->dle_val = val; - e->dle_list = NULL; - return e; -} - -void -DLInitElem(Dlelem *e, void *val) -{ - e->dle_next = NULL; - e->dle_prev = NULL; - e->dle_val = val; - e->dle_list = NULL; -} - -void -DLFreeElem(Dlelem *e) -{ - pfree(e); -} - -void -DLRemove(Dlelem *e) -{ - Dllist *l = e->dle_list; - - if (e->dle_prev) - e->dle_prev->dle_next = e->dle_next; - else - { - /* must be the head element */ - Assert(e == l->dll_head); - l->dll_head = e->dle_next; - } - if (e->dle_next) - e->dle_next->dle_prev = e->dle_prev; - else - { - /* must be the tail element */ - Assert(e == l->dll_tail); - l->dll_tail = e->dle_prev; - } - - e->dle_next = NULL; - e->dle_prev = NULL; - e->dle_list = NULL; -} - -void -DLAddHead(Dllist *l, Dlelem *e) -{ - e->dle_list = l; - - if (l->dll_head) - l->dll_head->dle_prev = e; - e->dle_next = l->dll_head; - e->dle_prev = NULL; - l->dll_head = e; - - if (l->dll_tail == NULL) /* if this is first element added */ - l->dll_tail = e; -} - -void -DLAddTail(Dllist *l, Dlelem *e) -{ - e->dle_list = l; - - if (l->dll_tail) - l->dll_tail->dle_next = e; - e->dle_prev = l->dll_tail; - e->dle_next = NULL; - l->dll_tail = e; - - if (l->dll_head == NULL) /* if this is first element added */ - l->dll_head = e; -} - -Dlelem * -DLRemHead(Dllist *l) -{ - /* remove and return the head */ - Dlelem *result = l->dll_head; - - if (result == NULL) - return result; - - if (result->dle_next) - result->dle_next->dle_prev = NULL; - - l->dll_head = result->dle_next; - - if (result == l->dll_tail) /* if the head is also the tail */ - l->dll_tail = NULL; - - 
result->dle_next = NULL; - result->dle_list = NULL; - - return result; -} - -Dlelem * -DLRemTail(Dllist *l) -{ - /* remove and return the tail */ - Dlelem *result = l->dll_tail; - - if (result == NULL) - return result; - - if (result->dle_prev) - result->dle_prev->dle_next = NULL; - - l->dll_tail = result->dle_prev; - - if (result == l->dll_head) /* if the tail is also the head */ - l->dll_head = NULL; - - result->dle_prev = NULL; - result->dle_list = NULL; - - return result; -} - -/* Same as DLRemove followed by DLAddHead, but faster */ -void -DLMoveToFront(Dlelem *e) -{ - Dllist *l = e->dle_list; - - if (l->dll_head == e) - return; /* Fast path if already at front */ - - Assert(e->dle_prev != NULL); /* since it's not the head */ - e->dle_prev->dle_next = e->dle_next; - - if (e->dle_next) - e->dle_next->dle_prev = e->dle_prev; - else - { - /* must be the tail element */ - Assert(e == l->dll_tail); - l->dll_tail = e->dle_prev; - } - - l->dll_head->dle_prev = e; - e->dle_next = l->dll_head; - e->dle_prev = NULL; - l->dll_head = e; - /* We need not check dll_tail, since there must have been > 1 entry */ -} diff --git a/src/backend/lib/ilist.c b/src/backend/lib/ilist.c new file mode 100644 index 0000000000..0126320d42 --- /dev/null +++ b/src/backend/lib/ilist.c @@ -0,0 +1,114 @@ +/*------------------------------------------------------------------------- + * + * ilist.c + * support for integrated/inline doubly- and singly- linked lists + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/lib/ilist.c + * + * NOTES + * This file only contains functions that are too big to be considered + * for inlining. See ilist.h for most of the goodies. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +/* See ilist.h */ +#define ILIST_INCLUDE_DEFINITIONS + +#include "lib/ilist.h" + +/* + * Delete 'node' from list. 
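 *
 * For illustration only (a sketch, not part of this patch), a typical call
 * sequence, assuming a struct that embeds an slist_node and the slist_init()
 * and slist_push_head() helpers declared in lib/ilist.h:
 *
 *     typedef struct MyItem
 *     {
 *         slist_node  node;
 *         int         value;
 *     } MyItem;
 *
 *     slist_head  lst;
 *     MyItem     *item = palloc(sizeof(MyItem));
 *
 *     slist_init(&lst);
 *     slist_push_head(&lst, &item->node);
 *     ...
 *     slist_delete(&lst, &item->node);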
+ * + * It is not allowed to delete a 'node' which is is not in the list 'head' + * + * Caution: this is O(n) + */ +void +slist_delete(slist_head *head, slist_node *node) +{ + slist_node *last = &head->head; + slist_node *cur; + bool found PG_USED_FOR_ASSERTS_ONLY = false; + + while ((cur = last->next) != NULL) + { + if (cur == node) + { + last->next = cur->next; +#ifdef USE_ASSERT_CHECKING + found = true; +#endif + break; + } + last = cur; + } + Assert(found); + + slist_check(head); +} + +#ifdef ILIST_DEBUG +/* + * Verify integrity of a doubly linked list + */ +void +dlist_check(dlist_head *head) +{ + dlist_node *cur; + + if (head == NULL) + elog(ERROR, "doubly linked list head address is NULL"); + + if (head->head.next == NULL && head->head.prev == NULL) + return; /* OK, initialized as zeroes */ + + /* iterate in forward direction */ + for (cur = head->head.next; cur != &head->head; cur = cur->next) + { + if (cur == NULL || + cur->next == NULL || + cur->prev == NULL || + cur->prev->next != cur || + cur->next->prev != cur) + elog(ERROR, "doubly linked list is corrupted"); + } + + /* iterate in backward direction */ + for (cur = head->head.prev; cur != &head->head; cur = cur->prev) + { + if (cur == NULL || + cur->next == NULL || + cur->prev == NULL || + cur->prev->next != cur || + cur->next->prev != cur) + elog(ERROR, "doubly linked list is corrupted"); + } +} + +/* + * Verify integrity of a singly linked list + */ +void +slist_check(slist_head *head) +{ + slist_node *cur; + + if (head == NULL) + elog(ERROR, "singly linked list head address is NULL"); + + /* + * there isn't much we can test in a singly linked list except that it + * actually ends sometime, i.e. hasn't introduced a cycle or similar + */ + for (cur = head->head.next; cur != NULL; cur = cur->next) + ; +} + +#endif /* ILIST_DEBUG */ diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index ca470e1883..cc1140d9bc 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -2209,7 +2209,7 @@ CheckLDAPAuth(Port *port) r = ldap_search_s(ldap, port->hba->ldapbasedn, - LDAP_SCOPE_SUBTREE, + port->hba->ldapscope, filter, attributes, 0, diff --git a/src/backend/libpq/be-fsstubs.c b/src/backend/libpq/be-fsstubs.c index dbc00b450d..a28ab9b815 100644 --- a/src/backend/libpq/be-fsstubs.c +++ b/src/backend/libpq/be-fsstubs.c @@ -442,7 +442,7 @@ lo_import_with_oid(PG_FUNCTION_ARGS) static Oid lo_import_internal(text *filename, Oid lobjOid) { - File fd; + int fd; int nbytes, tmp PG_USED_FOR_ASSERTS_ONLY; char buf[BUFSIZE]; @@ -464,7 +464,7 @@ lo_import_internal(text *filename, Oid lobjOid) * open the file to be read in */ text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf)); - fd = PathNameOpenFile(fnamebuf, O_RDONLY | PG_BINARY, S_IRWXU); + fd = OpenTransientFile(fnamebuf, O_RDONLY | PG_BINARY, S_IRWXU); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -481,7 +481,7 @@ lo_import_internal(text *filename, Oid lobjOid) */ lobj = inv_open(oid, INV_WRITE, fscxt); - while ((nbytes = FileRead(fd, buf, BUFSIZE)) > 0) + while ((nbytes = read(fd, buf, BUFSIZE)) > 0) { tmp = inv_write(lobj, buf, nbytes); Assert(tmp == nbytes); @@ -494,7 +494,7 @@ lo_import_internal(text *filename, Oid lobjOid) fnamebuf))); inv_close(lobj); - FileClose(fd); + CloseTransientFile(fd); return oid; } @@ -508,7 +508,7 @@ lo_export(PG_FUNCTION_ARGS) { Oid lobjId = PG_GETARG_OID(0); text *filename = PG_GETARG_TEXT_PP(1); - File fd; + int fd; int nbytes, tmp; char buf[BUFSIZE]; @@ -540,8 +540,8 @@ lo_export(PG_FUNCTION_ARGS) */ 
text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf)); oumask = umask(S_IWGRP | S_IWOTH); - fd = PathNameOpenFile(fnamebuf, O_CREAT | O_WRONLY | O_TRUNC | PG_BINARY, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + fd = OpenTransientFile(fnamebuf, O_CREAT | O_WRONLY | O_TRUNC | PG_BINARY, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); umask(oumask); if (fd < 0) ereport(ERROR, @@ -554,7 +554,7 @@ lo_export(PG_FUNCTION_ARGS) */ while ((nbytes = inv_read(lobj, buf, BUFSIZE)) > 0) { - tmp = FileWrite(fd, buf, nbytes); + tmp = write(fd, buf, nbytes); if (tmp != nbytes) ereport(ERROR, (errcode_for_file_access(), @@ -562,7 +562,7 @@ lo_export(PG_FUNCTION_ARGS) fnamebuf))); } - FileClose(fd); + CloseTransientFile(fd); inv_close(lobj); PG_RETURN_INT32(1); diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 7502e82860..40727a9c8e 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -37,6 +37,13 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" +#ifdef USE_LDAP +#ifndef WIN32 +#include +#endif +/* currently no Windows LDAP needed in this file */ +#endif + #define atooid(x) ((Oid) strtoul((x), NULL, 10)) #define atoxid(x) ((TransactionId) strtoul((x), NULL, 10)) @@ -1336,7 +1343,7 @@ parse_hba_line(List *line, int line_num) { ereport(LOG, (errcode(ERRCODE_CONFIG_FILE_ERROR), - errmsg("cannot use ldapbasedn, ldapbinddn, ldapbindpasswd, or ldapsearchattribute together with ldapprefix"), + errmsg("cannot use ldapbasedn, ldapbinddn, ldapbindpasswd, ldapsearchattribute, or ldapurl together with ldapprefix"), errcontext("line %d of configuration file \"%s\"", line_num, HbaFileName))); return NULL; @@ -1378,6 +1385,10 @@ parse_hba_line(List *line, int line_num) static bool parse_hba_auth_opt(char *name, char *val, HbaLine *hbaline, int line_num) { +#ifdef USE_LDAP + hbaline->ldapscope = LDAP_SCOPE_SUBTREE; +#endif + if (strcmp(name, "map") == 0) { if (hbaline->auth_method != uaIdent && @@ -1437,6 +1448,55 @@ parse_hba_auth_opt(char *name, char *val, HbaLine *hbaline, int line_num) REQUIRE_AUTH_OPTION(uaPAM, "pamservice", "pam"); hbaline->pamservice = pstrdup(val); } + else if (strcmp(name, "ldapurl") == 0) + { +#ifdef LDAP_API_FEATURE_X_OPENLDAP + LDAPURLDesc *urldata; + int rc; +#endif + + REQUIRE_AUTH_OPTION(uaLDAP, "ldapurl", "ldap"); +#ifdef LDAP_API_FEATURE_X_OPENLDAP + rc = ldap_url_parse(val, &urldata); + if (rc != LDAP_SUCCESS) + { + ereport(LOG, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not parse LDAP URL \"%s\": %s", val, ldap_err2string(rc)))); + return false; + } + + if (strcmp(urldata->lud_scheme, "ldap") != 0) + { + ereport(LOG, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("unsupported LDAP URL scheme: %s", urldata->lud_scheme))); + ldap_free_urldesc(urldata); + return false; + } + + hbaline->ldapserver = pstrdup(urldata->lud_host); + hbaline->ldapport = urldata->lud_port; + hbaline->ldapbasedn = pstrdup(urldata->lud_dn); + + if (urldata->lud_attrs) + hbaline->ldapsearchattribute = pstrdup(urldata->lud_attrs[0]); /* only use first one */ + hbaline->ldapscope = urldata->lud_scope; + if (urldata->lud_filter) + { + ereport(LOG, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("filters not supported in LDAP URLs"))); + ldap_free_urldesc(urldata); + return false; + } + ldap_free_urldesc(urldata); +#else /* not OpenLDAP */ + ereport(LOG, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("LDAP URLs not supported on this platform"))); +#endif /* not OpenLDAP */ + } else if (strcmp(name, "ldaptls") == 0) { REQUIRE_AUTH_OPTION(uaLDAP, "ldaptls", 
"ldap"); diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 5e86987f22..15a01a8324 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -308,6 +308,14 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, * that file path */ UNIXSOCK_PATH(unixSocketPath, portNumber, unixSocketDir); + if (strlen(unixSocketPath) >= UNIXSOCK_PATH_BUFLEN) + { + ereport(LOG, + (errmsg("Unix-domain socket path \"%s\" is too long (maximum %d bytes)", + unixSocketPath, + (int) (UNIXSOCK_PATH_BUFLEN - 1)))); + return STATUS_ERROR; + } if (Lock_AF_UNIX(unixSocketDir, unixSocketPath) != STATUS_OK) return STATUS_ERROR; service = unixSocketPath; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 02a0f62a53..35c6287bc8 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1815,6 +1815,7 @@ _outEquivalenceMember(StringInfo str, const EquivalenceMember *node) WRITE_NODE_FIELD(em_expr); WRITE_BITMAPSET_FIELD(em_relids); + WRITE_BITMAPSET_FIELD(em_nullable_relids); WRITE_BOOL_FIELD(em_is_const); WRITE_BOOL_FIELD(em_is_child); WRITE_OID_FIELD(em_datatype); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 458dae0489..3a5efa2114 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1835,10 +1835,10 @@ subquery_push_qual(Query *subquery, RangeTblEntry *rte, Index rti, Node *qual) * This step also ensures that when we are pushing into a setop tree, * each component query gets its own copy of the qual. */ - qual = ResolveNew(qual, rti, 0, rte, - subquery->targetList, - CMD_SELECT, 0, - &subquery->hasSubLinks); + qual = ReplaceVarsFromTargetList(qual, rti, 0, rte, + subquery->targetList, + REPLACEVARS_REPORT_ERROR, 0, + &subquery->hasSubLinks); /* * Now attach the qual to the proper place: normally WHERE, but if the diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index 42286a17e8..632295197a 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -30,7 +30,7 @@ static EquivalenceMember *add_eq_member(EquivalenceClass *ec, - Expr *expr, Relids relids, + Expr *expr, Relids relids, Relids nullable_relids, bool is_child, Oid datatype); static void generate_base_implied_equalities_const(PlannerInfo *root, EquivalenceClass *ec); @@ -106,7 +106,9 @@ process_equivalence(PlannerInfo *root, RestrictInfo *restrictinfo, Expr *item1; Expr *item2; Relids item1_relids, - item2_relids; + item2_relids, + item1_nullable_relids, + item2_nullable_relids; List *opfamilies; EquivalenceClass *ec1, *ec2; @@ -163,6 +165,12 @@ process_equivalence(PlannerInfo *root, RestrictInfo *restrictinfo, return false; /* RHS is non-strict but not constant */ } + /* Calculate nullable-relid sets for each side of the clause */ + item1_nullable_relids = bms_intersect(item1_relids, + restrictinfo->nullable_relids); + item2_nullable_relids = bms_intersect(item2_relids, + restrictinfo->nullable_relids); + /* * We use the declared input types of the operator, not exprType() of the * inputs, as the nominal datatypes for opfamily lookup. 
This presumes @@ -309,7 +317,8 @@ process_equivalence(PlannerInfo *root, RestrictInfo *restrictinfo, else if (ec1) { /* Case 3: add item2 to ec1 */ - em2 = add_eq_member(ec1, item2, item2_relids, false, item2_type); + em2 = add_eq_member(ec1, item2, item2_relids, item2_nullable_relids, + false, item2_type); ec1->ec_sources = lappend(ec1->ec_sources, restrictinfo); ec1->ec_below_outer_join |= below_outer_join; /* mark the RI as associated with this eclass */ @@ -322,7 +331,8 @@ process_equivalence(PlannerInfo *root, RestrictInfo *restrictinfo, else if (ec2) { /* Case 3: add item1 to ec2 */ - em1 = add_eq_member(ec2, item1, item1_relids, false, item1_type); + em1 = add_eq_member(ec2, item1, item1_relids, item1_nullable_relids, + false, item1_type); ec2->ec_sources = lappend(ec2->ec_sources, restrictinfo); ec2->ec_below_outer_join |= below_outer_join; /* mark the RI as associated with this eclass */ @@ -349,8 +359,10 @@ process_equivalence(PlannerInfo *root, RestrictInfo *restrictinfo, ec->ec_broken = false; ec->ec_sortref = 0; ec->ec_merged = NULL; - em1 = add_eq_member(ec, item1, item1_relids, false, item1_type); - em2 = add_eq_member(ec, item2, item2_relids, false, item2_type); + em1 = add_eq_member(ec, item1, item1_relids, item1_nullable_relids, + false, item1_type); + em2 = add_eq_member(ec, item2, item2_relids, item2_nullable_relids, + false, item2_type); root->eq_classes = lappend(root->eq_classes, ec); @@ -448,12 +460,13 @@ canonicalize_ec_expression(Expr *expr, Oid req_type, Oid req_collation) */ static EquivalenceMember * add_eq_member(EquivalenceClass *ec, Expr *expr, Relids relids, - bool is_child, Oid datatype) + Relids nullable_relids, bool is_child, Oid datatype) { EquivalenceMember *em = makeNode(EquivalenceMember); em->em_expr = expr; em->em_relids = relids; + em->em_nullable_relids = nullable_relids; em->em_is_const = false; em->em_is_child = is_child; em->em_datatype = datatype; @@ -609,7 +622,7 @@ get_eclass_for_sort_expr(PlannerInfo *root, elog(ERROR, "volatile EquivalenceClass has no sortref"); newem = add_eq_member(newec, copyObject(expr), pull_varnos((Node *) expr), - false, opcintype); + NULL, false, opcintype); /* * add_eq_member doesn't check for volatile functions, set-returning @@ -756,7 +769,12 @@ generate_base_implied_equalities_const(PlannerInfo *root, } } - /* Find the constant member to use */ + /* + * Find the constant member to use. We prefer an actual constant to + * pseudo-constants (such as Params), because the constraint exclusion + * machinery might be able to exclude relations on the basis of generated + * "var = const" equalities, but "var = param" won't work for that. 
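 *
 * For example (illustrative, not part of this patch): if a child table has
 * CHECK (x >= 100 AND x < 200) and the eclass contains {x, 42, $1}, emitting
 * "x = 42" lets constraint exclusion prove that child unreachable, whereas
 * "x = $1" proves nothing at plan time.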
+ */ foreach(lc, ec->ec_members) { EquivalenceMember *cur_em = (EquivalenceMember *) lfirst(lc); @@ -764,7 +782,8 @@ generate_base_implied_equalities_const(PlannerInfo *root, if (cur_em->em_is_const) { const_em = cur_em; - break; + if (IsA(cur_em->em_expr, Const)) + break; } } Assert(const_em != NULL); @@ -789,7 +808,9 @@ generate_base_implied_equalities_const(PlannerInfo *root, } process_implied_equality(root, eq_op, ec->ec_collation, cur_em->em_expr, const_em->em_expr, - ec->ec_relids, + bms_copy(ec->ec_relids), + bms_union(cur_em->em_nullable_relids, + const_em->em_nullable_relids), ec->ec_below_outer_join, cur_em->em_is_const); } @@ -844,7 +865,9 @@ generate_base_implied_equalities_no_const(PlannerInfo *root, } process_implied_equality(root, eq_op, ec->ec_collation, prev_em->em_expr, cur_em->em_expr, - ec->ec_relids, + bms_copy(ec->ec_relids), + bms_union(prev_em->em_nullable_relids, + cur_em->em_nullable_relids), ec->ec_below_outer_join, false); } @@ -1312,7 +1335,9 @@ create_join_clause(PlannerInfo *root, leftem->em_expr, rightem->em_expr, bms_union(leftem->em_relids, - rightem->em_relids)); + rightem->em_relids), + bms_union(leftem->em_nullable_relids, + rightem->em_nullable_relids)); /* Mark the clause as redundant, or not */ rinfo->parent_ec = parent_ec; @@ -1534,7 +1559,8 @@ reconsider_outer_join_clause(PlannerInfo *root, RestrictInfo *rinfo, left_type, right_type, inner_datatype; - Relids inner_relids; + Relids inner_relids, + inner_nullable_relids; ListCell *lc1; Assert(is_opclause(rinfo->clause)); @@ -1561,6 +1587,8 @@ reconsider_outer_join_clause(PlannerInfo *root, RestrictInfo *rinfo, inner_datatype = left_type; inner_relids = rinfo->left_relids; } + inner_nullable_relids = bms_intersect(inner_relids, + rinfo->nullable_relids); /* Scan EquivalenceClasses for a match to outervar */ foreach(lc1, root->eq_classes) @@ -1619,7 +1647,8 @@ reconsider_outer_join_clause(PlannerInfo *root, RestrictInfo *rinfo, cur_ec->ec_collation, innervar, cur_em->em_expr, - inner_relids); + bms_copy(inner_relids), + bms_copy(inner_nullable_relids)); if (process_equivalence(root, newrinfo, true)) match = true; } @@ -1653,7 +1682,9 @@ reconsider_full_join_clause(PlannerInfo *root, RestrictInfo *rinfo) left_type, right_type; Relids left_relids, - right_relids; + right_relids, + left_nullable_relids, + right_nullable_relids; ListCell *lc1; /* Can't use an outerjoin_delayed clause here */ @@ -1669,6 +1700,10 @@ reconsider_full_join_clause(PlannerInfo *root, RestrictInfo *rinfo) rightvar = (Expr *) get_rightop(rinfo->clause); left_relids = rinfo->left_relids; right_relids = rinfo->right_relids; + left_nullable_relids = bms_intersect(left_relids, + rinfo->nullable_relids); + right_nullable_relids = bms_intersect(right_relids, + rinfo->nullable_relids); foreach(lc1, root->eq_classes) { @@ -1754,7 +1789,8 @@ reconsider_full_join_clause(PlannerInfo *root, RestrictInfo *rinfo) cur_ec->ec_collation, leftvar, cur_em->em_expr, - left_relids); + bms_copy(left_relids), + bms_copy(left_nullable_relids)); if (process_equivalence(root, newrinfo, true)) matchleft = true; } @@ -1767,7 +1803,8 @@ reconsider_full_join_clause(PlannerInfo *root, RestrictInfo *rinfo) cur_ec->ec_collation, rightvar, cur_em->em_expr, - right_relids); + bms_copy(right_relids), + bms_copy(right_nullable_relids)); if (process_equivalence(root, newrinfo, true)) matchright = true; } @@ -1894,6 +1931,7 @@ add_child_rel_equivalences(PlannerInfo *root, /* Yes, generate transformed child version */ Expr *child_expr; Relids new_relids; + Relids 
new_nullable_relids; child_expr = (Expr *) adjust_appendrel_attrs(root, @@ -1910,7 +1948,21 @@ add_child_rel_equivalences(PlannerInfo *root, parent_rel->relids); new_relids = bms_add_members(new_relids, child_rel->relids); - (void) add_eq_member(cur_ec, child_expr, new_relids, + /* + * And likewise for nullable_relids. Note this code assumes + * parent and child relids are singletons. + */ + new_nullable_relids = cur_em->em_nullable_relids; + if (bms_overlap(new_nullable_relids, parent_rel->relids)) + { + new_nullable_relids = bms_difference(new_nullable_relids, + parent_rel->relids); + new_nullable_relids = bms_add_members(new_nullable_relids, + child_rel->relids); + } + + (void) add_eq_member(cur_ec, child_expr, + new_relids, new_nullable_relids, true, cur_em->em_datatype); } } @@ -1921,12 +1973,12 @@ add_child_rel_equivalences(PlannerInfo *root, /* * mutate_eclass_expressions * Apply an expression tree mutator to all expressions stored in - * equivalence classes. + * equivalence classes (but ignore child exprs unless include_child_exprs). * * This is a bit of a hack ... it's currently needed only by planagg.c, * which needs to do a global search-and-replace of MIN/MAX Aggrefs * after eclasses are already set up. Without changing the eclasses too, - * subsequent matching of ORDER BY clauses would fail. + * subsequent matching of ORDER BY and DISTINCT clauses would fail. * * Note that we assume the mutation won't affect relation membership or any * other properties we keep track of (which is a bit bogus, but by the time @@ -1936,7 +1988,8 @@ add_child_rel_equivalences(PlannerInfo *root, void mutate_eclass_expressions(PlannerInfo *root, Node *(*mutator) (), - void *context) + void *context, + bool include_child_exprs) { ListCell *lc1; @@ -1949,6 +2002,9 @@ mutate_eclass_expressions(PlannerInfo *root, { EquivalenceMember *cur_em = (EquivalenceMember *) lfirst(lc2); + if (cur_em->em_is_child && !include_child_exprs) + continue; /* ignore children unless requested */ + cur_em->em_expr = (Expr *) mutator((Node *) cur_em->em_expr, context); } diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 444ec2a40e..a1a7defef6 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -92,6 +92,7 @@ static void consider_index_join_outer_rels(PlannerInfo *root, RelOptInfo *rel, IndexClauseSet *eclauseset, List **bitindexpaths, List *indexjoinclauses, + int considered_clauses, List **considered_relids); static void get_join_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, @@ -101,6 +102,8 @@ static void get_join_index_paths(PlannerInfo *root, RelOptInfo *rel, List **bitindexpaths, Relids relids, List **considered_relids); +static bool eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids, + List *indexjoinclauses); static bool bms_equal_any(Relids relids, List *relids_list); static void get_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, @@ -447,6 +450,7 @@ consider_index_join_clauses(PlannerInfo *root, RelOptInfo *rel, IndexClauseSet *eclauseset, List **bitindexpaths) { + int considered_clauses = 0; List *considered_relids = NIL; int indexcol; @@ -460,8 +464,11 @@ consider_index_join_clauses(PlannerInfo *root, RelOptInfo *rel, * filter (qpqual); which is where an available clause would end up being * applied if we omit it from the indexquals. 
* - * This looks expensive, but in practical cases there won't be very many - * distinct sets of outer rels to consider. + * This looks expensive, but in most practical cases there won't be very + * many distinct sets of outer rels to consider. As a safety valve when + * that's not true, we use a heuristic: limit the number of outer rel sets + * considered to a multiple of the number of clauses considered. (We'll + * always consider using each individual join clause, though.) * * For simplicity in selecting relevant clauses, we represent each set of * outer rels as a maximum set of clause_relids --- that is, the indexed @@ -471,16 +478,20 @@ consider_index_join_clauses(PlannerInfo *root, RelOptInfo *rel, for (indexcol = 0; indexcol < index->ncolumns; indexcol++) { /* Consider each applicable simple join clause */ + considered_clauses += list_length(jclauseset->indexclauses[indexcol]); consider_index_join_outer_rels(root, rel, index, rclauseset, jclauseset, eclauseset, bitindexpaths, jclauseset->indexclauses[indexcol], + considered_clauses, &considered_relids); /* Consider each applicable eclass join clause */ + considered_clauses += list_length(eclauseset->indexclauses[indexcol]); consider_index_join_outer_rels(root, rel, index, rclauseset, jclauseset, eclauseset, bitindexpaths, eclauseset->indexclauses[indexcol], + considered_clauses, &considered_relids); } } @@ -494,6 +505,7 @@ consider_index_join_clauses(PlannerInfo *root, RelOptInfo *rel, * 'rel', 'index', 'rclauseset', 'jclauseset', 'eclauseset', and * 'bitindexpaths' as above * 'indexjoinclauses' is a list of RestrictInfos for join clauses + * 'considered_clauses' is the total number of clauses considered (so far) * '*considered_relids' is a list of all relids sets already considered */ static void @@ -504,6 +516,7 @@ consider_index_join_outer_rels(PlannerInfo *root, RelOptInfo *rel, IndexClauseSet *eclauseset, List **bitindexpaths, List *indexjoinclauses, + int considered_clauses, List **considered_relids) { ListCell *lc; @@ -522,7 +535,9 @@ consider_index_join_outer_rels(PlannerInfo *root, RelOptInfo *rel, /* * Generate the union of this clause's relids set with each * previously-tried set. This ensures we try this clause along with - * every interesting subset of previous clauses. + * every interesting subset of previous clauses. However, to avoid + * exponential growth of planning time when there are many clauses, + * limit the number of relid sets accepted to 10 * considered_clauses. * * Note: get_join_index_paths adds entries to *considered_relids, but * it prepends them to the list, so that we won't visit new entries @@ -543,6 +558,27 @@ consider_index_join_outer_rels(PlannerInfo *root, RelOptInfo *rel, if (bms_subset_compare(clause_relids, oldrelids) != BMS_DIFFERENT) continue; + /* + * If this clause was derived from an equivalence class, the + * clause list may contain other clauses derived from the same + * eclass. We should not consider that combining this clause with + * one of those clauses generates a usefully different + * parameterization; so skip if any clause derived from the same + * eclass would already have been included when using oldrelids. + */ + if (rinfo->parent_ec && + eclass_already_used(rinfo->parent_ec, oldrelids, + indexjoinclauses)) + continue; + + /* + * If the number of relid sets considered exceeds our heuristic + * limit, stop considering combinations of clauses. We'll still + * consider the current clause alone, though (below this loop). 
+ */ + if (list_length(*considered_relids) >= 10 * considered_clauses) + break; + /* OK, try the union set */ get_join_index_paths(root, rel, index, rclauseset, jclauseset, eclauseset, @@ -647,6 +683,28 @@ get_join_index_paths(PlannerInfo *root, RelOptInfo *rel, *considered_relids = lcons(relids, *considered_relids); } +/* + * eclass_already_used + * True if any join clause usable with oldrelids was generated from + * the specified equivalence class. + */ +static bool +eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids, + List *indexjoinclauses) +{ + ListCell *lc; + + foreach(lc, indexjoinclauses) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + if (rinfo->parent_ec == parent_ec && + bms_is_subset(rinfo->clause_relids, oldrelids)) + return true; + } + return false; +} + /* * bms_equal_any * True if relids is bms_equal to any member of relids_list @@ -2531,12 +2589,83 @@ match_clause_to_ordering_op(IndexOptInfo *index, void check_partial_indexes(PlannerInfo *root, RelOptInfo *rel) { - List *restrictinfo_list = rel->baserestrictinfo; - ListCell *ilist; + List *clauselist; + bool have_partial; + Relids otherrels; + ListCell *lc; + + /* + * Frequently, there will be no partial indexes, so first check to make + * sure there's something useful to do here. + */ + have_partial = false; + foreach(lc, rel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(lc); - foreach(ilist, rel->indexlist) + if (index->indpred == NIL) + continue; /* ignore non-partial indexes */ + + if (index->predOK) + continue; /* don't repeat work if already proven OK */ + + have_partial = true; + break; + } + if (!have_partial) + return; + + /* + * Construct a list of clauses that we can assume true for the purpose + * of proving the index(es) usable. Restriction clauses for the rel are + * always usable, and so are any join clauses that are "movable to" this + * rel. Also, we can consider any EC-derivable join clauses (which must + * be "movable to" this rel, by definition). + */ + clauselist = list_copy(rel->baserestrictinfo); + + /* Scan the rel's join clauses */ + foreach(lc, rel->joininfo) { - IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist); + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + /* Check if clause can be moved to this rel */ + if (!join_clause_is_movable_to(rinfo, rel->relid)) + continue; + + clauselist = lappend(clauselist, rinfo); + } + + /* + * Add on any equivalence-derivable join clauses. Computing the correct + * relid sets for generate_join_implied_equalities is slightly tricky + * because the rel could be a child rel rather than a true baserel, and + * in that case we must remove its parent's relid from all_baserels. 
+ */ + if (rel->reloptkind == RELOPT_OTHER_MEMBER_REL) + { + /* Lookup parent->child translation data */ + AppendRelInfo *appinfo = find_childrel_appendrelinfo(root, rel); + + otherrels = bms_difference(root->all_baserels, + bms_make_singleton(appinfo->parent_relid)); + } + else + otherrels = bms_difference(root->all_baserels, rel->relids); + + if (!bms_is_empty(otherrels)) + clauselist = + list_concat(clauselist, + generate_join_implied_equalities(root, + bms_union(rel->relids, + otherrels), + otherrels, + rel)); + + /* Now try to prove each index predicate true */ + foreach(lc, rel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(lc); if (index->indpred == NIL) continue; /* ignore non-partial indexes */ @@ -2544,8 +2673,7 @@ check_partial_indexes(PlannerInfo *root, RelOptInfo *rel) if (index->predOK) continue; /* don't repeat work if already proven OK */ - index->predOK = predicate_implied_by(index->indpred, - restrictinfo_list); + index->predOK = predicate_implied_by(index->indpred, clauselist); } } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 9565e2d607..bd719b57a6 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -51,9 +51,12 @@ static void distribute_qual_to_rels(PlannerInfo *root, Node *clause, JoinType jointype, Relids qualscope, Relids ojscope, - Relids outerjoin_nonnullable); + Relids outerjoin_nonnullable, + Relids deduced_nullable_relids); static bool check_outerjoin_delay(PlannerInfo *root, Relids *relids_p, Relids *nullable_relids_p, bool is_pushed_down); +static bool check_equivalence_delay(PlannerInfo *root, + RestrictInfo *restrictinfo); static bool check_redundant_nullability_qual(PlannerInfo *root, Node *clause); static void check_mergejoinable(RestrictInfo *restrictinfo); static void check_hashjoinable(RestrictInfo *restrictinfo); @@ -641,7 +644,7 @@ deconstruct_recurse(PlannerInfo *root, Node *jtnode, bool below_outer_join, distribute_qual_to_rels(root, qual, false, below_outer_join, JOIN_INNER, - *qualscope, NULL, NULL); + *qualscope, NULL, NULL, NULL); } } else if (IsA(jtnode, JoinExpr)) @@ -765,7 +768,7 @@ deconstruct_recurse(PlannerInfo *root, Node *jtnode, bool below_outer_join, distribute_qual_to_rels(root, qual, false, below_outer_join, j->jointype, *qualscope, - ojscope, nonnullable_rels); + ojscope, nonnullable_rels, NULL); } /* Now we can add the SpecialJoinInfo to join_info_list */ @@ -1074,13 +1077,19 @@ make_outerjoininfo(PlannerInfo *root, * baserels appearing on the outer (nonnullable) side of the join * (for FULL JOIN this includes both sides of the join, and must in fact * equal qualscope) + * 'deduced_nullable_relids': if is_deduced is TRUE, the nullable relids to + * impute to the clause; otherwise NULL * * 'qualscope' identifies what level of JOIN the qual came from syntactically. * 'ojscope' is needed if we decide to force the qual up to the outer-join * level, which will be ojscope not necessarily qualscope. * - * At the time this is called, root->join_info_list must contain entries for - * all and only those special joins that are syntactically below this qual. + * In normal use (when is_deduced is FALSE), at the time this is called, + * root->join_info_list must contain entries for all and only those special + * joins that are syntactically below this qual. 
But when is_deduced is TRUE, + * we are adding new deduced clauses after completion of deconstruct_jointree, + * so it cannot be assumed that root->join_info_list has anything to do with + * qual placement. */ static void distribute_qual_to_rels(PlannerInfo *root, Node *clause, @@ -1089,7 +1098,8 @@ distribute_qual_to_rels(PlannerInfo *root, Node *clause, JoinType jointype, Relids qualscope, Relids ojscope, - Relids outerjoin_nonnullable) + Relids outerjoin_nonnullable, + Relids deduced_nullable_relids) { Relids relids; bool is_pushed_down; @@ -1211,12 +1221,13 @@ distribute_qual_to_rels(PlannerInfo *root, Node *clause, * If the qual came from implied-equality deduction, it should not be * outerjoin-delayed, else deducer blew it. But we can't check this * because the join_info_list may now contain OJs above where the qual - * belongs. + * belongs. For the same reason, we must rely on caller to supply the + * correct nullable_relids set. */ Assert(!ojscope); is_pushed_down = true; outerjoin_delayed = false; - nullable_relids = NULL; + nullable_relids = deduced_nullable_relids; /* Don't feed it back for more deductions */ maybe_equivalence = false; maybe_outer_join = false; @@ -1388,7 +1399,8 @@ distribute_qual_to_rels(PlannerInfo *root, Node *clause, { if (maybe_equivalence) { - if (process_equivalence(root, restrictinfo, below_outer_join)) + if (check_equivalence_delay(root, restrictinfo) && + process_equivalence(root, restrictinfo, below_outer_join)) return; /* EC rejected it, so set left_ec/right_ec the hard way ... */ initialize_mergeclause_eclasses(root, restrictinfo); @@ -1560,6 +1572,44 @@ check_outerjoin_delay(PlannerInfo *root, return outerjoin_delayed; } +/* + * check_equivalence_delay + * Detect whether a potential equivalence clause is rendered unsafe + * by outer-join-delay considerations. Return TRUE if it's safe. + * + * The initial tests in distribute_qual_to_rels will consider a mergejoinable + * clause to be a potential equivalence clause if it is not outerjoin_delayed. + * But since the point of equivalence processing is that we will recombine the + * two sides of the clause with others, we have to check that each side + * satisfies the not-outerjoin_delayed condition on its own; otherwise it might + * not be safe to evaluate everywhere we could place a derived equivalence + * condition. + */ +static bool +check_equivalence_delay(PlannerInfo *root, + RestrictInfo *restrictinfo) +{ + Relids relids; + Relids nullable_relids; + + /* fast path if no special joins */ + if (root->join_info_list == NIL) + return true; + + /* must copy restrictinfo's relids to avoid changing it */ + relids = bms_copy(restrictinfo->left_relids); + /* check left side does not need delay */ + if (check_outerjoin_delay(root, &relids, &nullable_relids, true)) + return false; + + /* and similarly for the right side */ + relids = bms_copy(restrictinfo->right_relids); + if (check_outerjoin_delay(root, &relids, &nullable_relids, true)) + return false; + + return true; +} + /* * check_redundant_nullability_qual * Check to see if the qual is an IS NULL qual that is redundant with @@ -1670,11 +1720,20 @@ distribute_restrictinfo_to_rels(PlannerInfo *root, * variable-free. Otherwise the qual is applied at the lowest join level * that provides all its variables. * + * "nullable_relids" is the set of relids used in the expressions that are + * potentially nullable below the expressions. 
(This has to be supplied by + * caller because this function is used after deconstruct_jointree, so we + * don't have knowledge of where the clause items came from.) + * * "both_const" indicates whether both items are known pseudo-constant; * in this case it is worth applying eval_const_expressions() in case we * can produce constant TRUE or constant FALSE. (Otherwise it's not, * because the expressions went through eval_const_expressions already.) * + * Note: this function will copy item1 and item2, but it is caller's + * responsibility to make sure that the Relids parameters are fresh copies + * not shared with other uses. + * * This is currently used only when an EquivalenceClass is found to * contain pseudoconstants. See path/pathkeys.c for more details. */ @@ -1685,6 +1744,7 @@ process_implied_equality(PlannerInfo *root, Expr *item1, Expr *item2, Relids qualscope, + Relids nullable_relids, bool below_outer_join, bool both_const) { @@ -1718,15 +1778,12 @@ process_implied_equality(PlannerInfo *root, } } - /* Make a copy of qualscope to avoid problems if source EC changes */ - qualscope = bms_copy(qualscope); - /* * Push the new clause into all the appropriate restrictinfo lists. */ distribute_qual_to_rels(root, (Node *) clause, true, below_outer_join, JOIN_INNER, - qualscope, NULL, NULL); + qualscope, NULL, NULL, nullable_relids); } /* @@ -1735,6 +1792,10 @@ process_implied_equality(PlannerInfo *root, * This overlaps the functionality of process_implied_equality(), but we * must return the RestrictInfo, not push it into the joininfo tree. * + * Note: this function will copy item1 and item2, but it is caller's + * responsibility to make sure that the Relids parameters are fresh copies + * not shared with other uses. + * * Note: we do not do initialize_mergeclause_eclasses() here. It is * caller's responsibility that left_ec/right_ec be set as necessary. */ @@ -1743,7 +1804,8 @@ build_implied_join_equality(Oid opno, Oid collation, Expr *item1, Expr *item2, - Relids qualscope) + Relids qualscope, + Relids nullable_relids) { RestrictInfo *restrictinfo; Expr *clause; @@ -1760,9 +1822,6 @@ build_implied_join_equality(Oid opno, InvalidOid, collation); - /* Make a copy of qualscope to avoid problems if source EC changes */ - qualscope = bms_copy(qualscope); - /* * Build the RestrictInfo node itself. */ @@ -1772,7 +1831,7 @@ build_implied_join_equality(Oid opno, false, /* pseudoconstant */ qualscope, /* required_relids */ NULL, /* outer_relids */ - NULL); /* nullable_relids */ + nullable_relids); /* nullable_relids */ /* Set mergejoinability/hashjoinability flags */ check_mergejoinable(restrictinfo); diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c index 55a5ed7b4c..658a4abc31 100644 --- a/src/backend/optimizer/plan/planagg.c +++ b/src/backend/optimizer/plan/planagg.c @@ -257,7 +257,10 @@ optimize_minmax_aggregates(PlannerInfo *root, List *tlist, /* * We have to replace Aggrefs with Params in equivalence classes too, else - * ORDER BY or DISTINCT on an optimized aggregate will fail. + * ORDER BY or DISTINCT on an optimized aggregate will fail. We don't + * need to process child eclass members though, since they aren't of + * interest anymore --- and replace_aggs_with_params_mutator isn't able + * to handle Aggrefs containing translated child Vars, anyway. * * Note: at some point it might become necessary to mutate other data * structures too, such as the query's sortClause or distinctClause. 
Right @@ -265,7 +268,8 @@ optimize_minmax_aggregates(PlannerInfo *root, List *tlist, */ mutate_eclass_expressions(root, replace_aggs_with_params_mutator, - (void *) root); + (void *) root, + false); /* * Generate the output plan --- basically just a Result diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index abcd0ee574..04d502866b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -170,9 +170,10 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor - * still needs to insert into "invalid" indexes! + * still needs to insert into "invalid" indexes, if they're marked + * IndexIsReady. */ - if (!index->indisvalid) + if (!IndexIsValid(index)) { index_close(indexRelation, NoLock); continue; diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 823d3b445a..7085035c52 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -26,6 +26,7 @@ #include "access/sysattr.h" #include "catalog/pg_type.h" +#include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/var.h" @@ -1487,6 +1488,9 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, Assert(stmt && IsA(stmt, SelectStmt)); + /* Guard against stack overflow due to overly complex set-expressions */ + check_stack_depth(); + /* * Validity-check both leaf and internal SELECTs for disallowed ops. */ diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index e4ff76e66e..ad98b364f1 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -2383,6 +2383,10 @@ copy_opt_item: { $$ = makeDefElem("oids", (Node *)makeInteger(TRUE)); } + | FREEZE + { + $$ = makeDefElem("freeze", (Node *)makeInteger(TRUE)); + } | DELIMITER opt_as Sconst { $$ = makeDefElem("delimiter", (Node *)makeString($3)); diff --git a/src/backend/parser/parse_node.c b/src/backend/parser/parse_node.c index 2e9fad0f97..91a1891e97 100644 --- a/src/backend/parser/parse_node.c +++ b/src/backend/parser/parse_node.c @@ -145,10 +145,10 @@ setup_parser_errposition_callback(ParseCallbackState *pcbstate, /* Setup error traceback support for ereport() */ pcbstate->pstate = pstate; pcbstate->location = location; - pcbstate->errcontext.callback = pcb_error_callback; - pcbstate->errcontext.arg = (void *) pcbstate; - pcbstate->errcontext.previous = error_context_stack; - error_context_stack = &pcbstate->errcontext; + pcbstate->errcallback.callback = pcb_error_callback; + pcbstate->errcallback.arg = (void *) pcbstate; + pcbstate->errcallback.previous = error_context_stack; + error_context_stack = &pcbstate->errcallback; } /* @@ -158,7 +158,7 @@ void cancel_parser_errposition_callback(ParseCallbackState *pcbstate) { /* Pop the error context stack */ - error_context_stack = pcbstate->errcontext.previous; + error_context_stack = pcbstate->errcallback.previous; } /* diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 95c57e81b5..086cc75e22 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1533,18 +1533,12 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) index_name, RelationGetRelationName(heap_rel)), parser_errposition(cxt->pstate, constraint->location))); - if 
(!index_form->indisvalid) + if (!IndexIsValid(index_form)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("index \"%s\" is not valid", index_name), parser_errposition(cxt->pstate, constraint->location))); - if (!index_form->indisready) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("index \"%s\" is not ready", index_name), - parser_errposition(cxt->pstate, constraint->location))); - if (!index_form->indisunique) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), diff --git a/src/backend/port/ipc_test.c b/src/backend/port/ipc_test.c index 4632a628e3..c59fd3efad 100644 --- a/src/backend/port/ipc_test.c +++ b/src/backend/port/ipc_test.c @@ -113,7 +113,7 @@ ProcessInterrupts(void) { } -int +void ExceptionalCondition(const char *conditionName, const char *errorType, const char *fileName, @@ -123,7 +123,6 @@ ExceptionalCondition(const char *conditionName, errorType, conditionName, fileName, lineNumber); abort(); - return 0; } diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c index 335e9f66af..d3b2247f05 100644 --- a/src/backend/port/unix_latch.c +++ b/src/backend/port/unix_latch.c @@ -33,6 +33,7 @@ #include "postgres.h" #include +#include #include #include #include @@ -48,6 +49,7 @@ #endif #include "miscadmin.h" +#include "portability/instr_time.h" #include "postmaster/postmaster.h" #include "storage/latch.h" #include "storage/pmsignal.h" @@ -60,21 +62,50 @@ static volatile sig_atomic_t waiting = false; static int selfpipe_readfd = -1; static int selfpipe_writefd = -1; -/* private function prototypes */ -static void initSelfPipe(void); -static void drainSelfPipe(void); +/* Private function prototypes */ static void sendSelfPipeByte(void); +static void drainSelfPipe(void); +/* + * Initialize the process-local latch infrastructure. + * + * This must be called once during startup of any process that can wait on + * latches, before it issues any InitLatch() or OwnLatch() calls. + */ +void +InitializeLatchSupport(void) +{ + int pipefd[2]; + + Assert(selfpipe_readfd == -1); + + /* + * Set up the self-pipe that allows a signal handler to wake up the + * select() in WaitLatch. Make the write-end non-blocking, so that + * SetLatch won't block if the event has already been set many times + * filling the kernel buffer. Make the read-end non-blocking too, so that + * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK. + */ + if (pipe(pipefd) < 0) + elog(FATAL, "pipe() failed: %m"); + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0) + elog(FATAL, "fcntl() failed on read-end of self-pipe: %m"); + if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) < 0) + elog(FATAL, "fcntl() failed on write-end of self-pipe: %m"); + + selfpipe_readfd = pipefd[0]; + selfpipe_writefd = pipefd[1]; +} + /* * Initialize a backend-local latch. 
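 *
 * For illustration only (a sketch, not part of this patch), the expected
 * call order in a process that waits on a private latch, assuming the
 * WaitLatch() and ResetLatch() interfaces already declared in
 * storage/latch.h and a process-local "Latch my_latch":
 *
 *     InitializeLatchSupport();                  (once, early in startup)
 *     InitLatch(&my_latch);
 *
 *     for (;;)
 *     {
 *         int     rc;
 *
 *         rc = WaitLatch(&my_latch, WL_LATCH_SET | WL_TIMEOUT, 1000L);
 *         ResetLatch(&my_latch);
 *         if (rc & WL_TIMEOUT)
 *             ...                                (the 1-second timeout hit)
 *         ...                                    (handle whatever SetLatch signalled)
 *     }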
*/ void InitLatch(volatile Latch *latch) { - /* Initialize the self-pipe if this is our first latch in the process */ - if (selfpipe_readfd == -1) - initSelfPipe(); + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0); latch->is_set = false; latch->owner_pid = MyProcPid; @@ -116,11 +147,10 @@ InitSharedLatch(volatile Latch *latch) void OwnLatch(volatile Latch *latch) { - Assert(latch->is_shared); + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0); - /* Initialize the self-pipe if this is our first latch in this process */ - if (selfpipe_readfd == -1) - initSelfPipe(); + Assert(latch->is_shared); /* sanity check */ if (latch->owner_pid != 0) @@ -147,13 +177,10 @@ DisownLatch(volatile Latch *latch) * to wait for. If the latch is already set (and WL_LATCH_SET is given), the * function returns immediately. * - * The 'timeout' is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag - * is given. On some platforms, signals do not interrupt the wait, or even - * cause the timeout to be restarted, so beware that the function can sleep - * for several times longer than the requested timeout. However, this - * difficulty is not so great as it seems, because the signal handlers for any - * signals that the caller should respond to ought to be programmed to end the - * wait by calling SetLatch. Ideally, the timeout parameter is vestigial. + * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag + * is given. Although it is declared as "long", we don't actually support + * timeouts longer than INT_MAX milliseconds. Note that some extra overhead + * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible. * * The latch must be owned by the current process, ie. it must be a * backend-local latch initialized with InitLatch, or a shared latch @@ -183,13 +210,16 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, { int result = 0; int rc; + instr_time start_time, + cur_time; + long cur_timeout; #ifdef HAVE_POLL struct pollfd pfds[3]; int nfds; #else struct timeval tv, - *tvp = NULL; + *tvp; fd_set input_mask; fd_set output_mask; int hifd; @@ -206,21 +236,30 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, if ((wakeEvents & WL_LATCH_SET) && latch->owner_pid != MyProcPid) elog(ERROR, "cannot wait on a latch owned by another process"); - /* Initialize timeout */ + /* + * Initialize timeout if requested. We must record the current time so + * that we can determine the remaining timeout if the poll() or select() + * is interrupted. (On some platforms, select() will update the contents + * of "tv" for us, but unfortunately we can't rely on that.) 
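 *
 * For example (illustrative only): with timeout = 5000, if poll() or select()
 * is interrupted by a signal after roughly 1200 ms, the code below recomputes
 * cur_timeout = 5000 - 1200 = 3800 ms before waiting again, so the caller
 * still waits about five seconds in total instead of restarting the full
 * timeout after every interruption.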
+ */ if (wakeEvents & WL_TIMEOUT) { - Assert(timeout >= 0); + INSTR_TIME_SET_CURRENT(start_time); + Assert(timeout >= 0 && timeout <= INT_MAX); + cur_timeout = timeout; + #ifndef HAVE_POLL - tv.tv_sec = timeout / 1000L; - tv.tv_usec = (timeout % 1000L) * 1000L; + tv.tv_sec = cur_timeout / 1000L; + tv.tv_usec = (cur_timeout % 1000L) * 1000L; tvp = &tv; #endif } else { -#ifdef HAVE_POLL - /* make sure poll() agrees there is no timeout */ - timeout = -1; + cur_timeout = -1; + +#ifndef HAVE_POLL + tvp = NULL; #endif } @@ -283,54 +322,62 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, } /* Sleep */ - rc = poll(pfds, nfds, (int) timeout); + rc = poll(pfds, nfds, (int) cur_timeout); /* Check return code */ if (rc < 0) { - if (errno == EINTR) - continue; - waiting = false; - ereport(ERROR, - (errcode_for_socket_access(), - errmsg("poll() failed: %m"))); + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("poll() failed: %m"))); + } } - if (rc == 0 && (wakeEvents & WL_TIMEOUT)) + else if (rc == 0) { /* timeout exceeded */ - result |= WL_TIMEOUT; - } - if ((wakeEvents & WL_SOCKET_READABLE) && - (pfds[0].revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) - { - /* data available in socket, or EOF/error condition */ - result |= WL_SOCKET_READABLE; + if (wakeEvents & WL_TIMEOUT) + result |= WL_TIMEOUT; } - if ((wakeEvents & WL_SOCKET_WRITEABLE) && - (pfds[0].revents & POLLOUT)) + else { - result |= WL_SOCKET_WRITEABLE; - } + /* at least one event occurred, so check revents values */ + if ((wakeEvents & WL_SOCKET_READABLE) && + (pfds[0].revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) + { + /* data available in socket, or EOF/error condition */ + result |= WL_SOCKET_READABLE; + } + if ((wakeEvents & WL_SOCKET_WRITEABLE) && + (pfds[0].revents & POLLOUT)) + { + result |= WL_SOCKET_WRITEABLE; + } - /* - * We expect a POLLHUP when the remote end is closed, but because we - * don't expect the pipe to become readable or to have any errors - * either, treat those as postmaster death, too. - */ - if ((wakeEvents & WL_POSTMASTER_DEATH) && - (pfds[nfds - 1].revents & (POLLHUP | POLLIN | POLLERR | POLLNVAL))) - { /* - * According to the select(2) man page on Linux, select(2) may - * spuriously return and report a file descriptor as readable, - * when it's not; and presumably so can poll(2). It's not clear - * that the relevant cases would ever apply to the postmaster - * pipe, but since the consequences of falsely returning - * WL_POSTMASTER_DEATH could be pretty unpleasant, we take the - * trouble to positively verify EOF with PostmasterIsAlive(). + * We expect a POLLHUP when the remote end is closed, but because + * we don't expect the pipe to become readable or to have any + * errors either, treat those cases as postmaster death, too. */ - if (!PostmasterIsAlive()) - result |= WL_POSTMASTER_DEATH; + if ((wakeEvents & WL_POSTMASTER_DEATH) && + (pfds[nfds - 1].revents & (POLLHUP | POLLIN | POLLERR | POLLNVAL))) + { + /* + * According to the select(2) man page on Linux, select(2) may + * spuriously return and report a file descriptor as readable, + * when it's not; and presumably so can poll(2). It's not + * clear that the relevant cases would ever apply to the + * postmaster pipe, but since the consequences of falsely + * returning WL_POSTMASTER_DEATH could be pretty unpleasant, + * we take the trouble to positively verify EOF with + * PostmasterIsAlive(). 
+ */ + if (!PostmasterIsAlive()) + result |= WL_POSTMASTER_DEATH; + } } #else /* !HAVE_POLL */ @@ -367,43 +414,66 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, /* Check return code */ if (rc < 0) { - if (errno == EINTR) - continue; - waiting = false; - ereport(ERROR, - (errcode_for_socket_access(), - errmsg("select() failed: %m"))); + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("select() failed: %m"))); + } } - if (rc == 0 && (wakeEvents & WL_TIMEOUT)) + else if (rc == 0) { /* timeout exceeded */ - result |= WL_TIMEOUT; - } - if ((wakeEvents & WL_SOCKET_READABLE) && FD_ISSET(sock, &input_mask)) - { - /* data available in socket, or EOF */ - result |= WL_SOCKET_READABLE; + if (wakeEvents & WL_TIMEOUT) + result |= WL_TIMEOUT; } - if ((wakeEvents & WL_SOCKET_WRITEABLE) && FD_ISSET(sock, &output_mask)) + else { - result |= WL_SOCKET_WRITEABLE; - } - if ((wakeEvents & WL_POSTMASTER_DEATH) && + /* at least one event occurred, so check masks */ + if ((wakeEvents & WL_SOCKET_READABLE) && FD_ISSET(sock, &input_mask)) + { + /* data available in socket, or EOF */ + result |= WL_SOCKET_READABLE; + } + if ((wakeEvents & WL_SOCKET_WRITEABLE) && FD_ISSET(sock, &output_mask)) + { + result |= WL_SOCKET_WRITEABLE; + } + if ((wakeEvents & WL_POSTMASTER_DEATH) && FD_ISSET(postmaster_alive_fds[POSTMASTER_FD_WATCH], &input_mask)) - { - /* - * According to the select(2) man page on Linux, select(2) may - * spuriously return and report a file descriptor as readable, - * when it's not; and presumably so can poll(2). It's not clear - * that the relevant cases would ever apply to the postmaster - * pipe, but since the consequences of falsely returning - * WL_POSTMASTER_DEATH could be pretty unpleasant, we take the - * trouble to positively verify EOF with PostmasterIsAlive(). - */ - if (!PostmasterIsAlive()) - result |= WL_POSTMASTER_DEATH; + { + /* + * According to the select(2) man page on Linux, select(2) may + * spuriously return and report a file descriptor as readable, + * when it's not; and presumably so can poll(2). It's not + * clear that the relevant cases would ever apply to the + * postmaster pipe, but since the consequences of falsely + * returning WL_POSTMASTER_DEATH could be pretty unpleasant, + * we take the trouble to positively verify EOF with + * PostmasterIsAlive(). + */ + if (!PostmasterIsAlive()) + result |= WL_POSTMASTER_DEATH; + } } #endif /* HAVE_POLL */ + + /* If we're not done, update cur_timeout for next iteration */ + if (result == 0 && cur_timeout >= 0) + { + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time); + if (cur_timeout < 0) + cur_timeout = 0; + +#ifndef HAVE_POLL + tv.tv_sec = cur_timeout / 1000L; + tv.tv_usec = (cur_timeout % 1000L) * 1000L; +#endif + } } while (result == 0); waiting = false; @@ -514,30 +584,6 @@ latch_sigusr1_handler(void) sendSelfPipeByte(); } -/* initialize the self-pipe */ -static void -initSelfPipe(void) -{ - int pipefd[2]; - - /* - * Set up the self-pipe that allows a signal handler to wake up the - * select() in WaitLatch. Make the write-end non-blocking, so that - * SetLatch won't block if the event has already been set many times - * filling the kernel buffer. Make the read-end non-blocking too, so that - * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK. 
- */ - if (pipe(pipefd) < 0) - elog(FATAL, "pipe() failed: %m"); - if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0) - elog(FATAL, "fcntl() failed on read-end of self-pipe: %m"); - if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) < 0) - elog(FATAL, "fcntl() failed on write-end of self-pipe: %m"); - - selfpipe_readfd = pipefd[0]; - selfpipe_writefd = pipefd[1]; -} - /* Send one byte to the self-pipe, to wake up WaitLatch */ static void sendSelfPipeByte(void) diff --git a/src/backend/port/win32_latch.c b/src/backend/port/win32_latch.c index 1f1ed33dc2..575035c28d 100644 --- a/src/backend/port/win32_latch.c +++ b/src/backend/port/win32_latch.c @@ -20,16 +20,24 @@ #include "postgres.h" #include +#include #include #include #include "miscadmin.h" +#include "portability/instr_time.h" #include "postmaster/postmaster.h" #include "storage/latch.h" #include "storage/pmsignal.h" #include "storage/shmem.h" +void +InitializeLatchSupport(void) +{ + /* currently, nothing to do here for Windows */ +} + void InitLatch(volatile Latch *latch) { @@ -94,6 +102,9 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, long timeout) { DWORD rc; + instr_time start_time, + cur_time; + long cur_timeout; HANDLE events[4]; HANDLE latchevent; HANDLE sockevent = WSA_INVALID_EVENT; @@ -112,11 +123,19 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, if ((wakeEvents & WL_LATCH_SET) && latch->owner_pid != MyProcPid) elog(ERROR, "cannot wait on a latch owned by another process"); - /* Convert timeout to form used by WaitForMultipleObjects() */ + /* + * Initialize timeout if requested. We must record the current time so + * that we can determine the remaining timeout if WaitForMultipleObjects + * is interrupted. + */ if (wakeEvents & WL_TIMEOUT) - Assert(timeout >= 0); + { + INSTR_TIME_SET_CURRENT(start_time); + Assert(timeout >= 0 && timeout <= INT_MAX); + cur_timeout = timeout; + } else - timeout = INFINITE; + cur_timeout = INFINITE; /* * Construct an array of event handles for WaitforMultipleObjects(). @@ -181,7 +200,7 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, break; } - rc = WaitForMultipleObjects(numevents, events, FALSE, timeout); + rc = WaitForMultipleObjects(numevents, events, FALSE, cur_timeout); if (rc == WAIT_FAILED) elog(ERROR, "WaitForMultipleObjects() failed: error code %lu", @@ -197,7 +216,11 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, } else if (rc == WAIT_OBJECT_0 + 1) { - /* Latch is set, we'll handle that on next iteration of loop */ + /* + * Latch is set. We'll handle that on next iteration of loop, but + * let's not waste the cycles to update cur_timeout below. 
+ */ + continue; } else if ((wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) && rc == WAIT_OBJECT_0 + 2) /* socket is at event slot 2 */ @@ -234,8 +257,17 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock, } else elog(ERROR, "unexpected return code from WaitForMultipleObjects(): %lu", rc); - } - while (result == 0); + + /* If we're not done, update cur_timeout for next iteration */ + if (result == 0 && cur_timeout != INFINITE) + { + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time); + if (cur_timeout < 0) + cur_timeout = 0; + } + } while (result == 0); /* Clean up the event object we created for the socket */ if (sockevent != WSA_INVALID_EVENT) diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 74db821387..6977bcf75e 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -77,7 +77,7 @@ #include "catalog/pg_database.h" #include "commands/dbcommands.h" #include "commands/vacuum.h" -#include "lib/dllist.h" +#include "lib/ilist.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" @@ -152,6 +152,7 @@ typedef struct avl_dbase Oid adl_datid; /* hash key -- must be first */ TimestampTz adl_next_worker; int adl_score; + dlist_node adl_node; } avl_dbase; /* struct to keep track of databases in worker */ @@ -208,7 +209,7 @@ typedef struct autovac_table */ typedef struct WorkerInfoData { - SHM_QUEUE wi_links; + dlist_node wi_links; Oid wi_dboid; Oid wi_tableoid; PGPROC *wi_proc; @@ -251,15 +252,18 @@ typedef struct { sig_atomic_t av_signal[AutoVacNumSignals]; pid_t av_launcherpid; - WorkerInfo av_freeWorkers; - SHM_QUEUE av_runningWorkers; + dlist_head av_freeWorkers; + dlist_head av_runningWorkers; WorkerInfo av_startingWorker; } AutoVacuumShmemStruct; static AutoVacuumShmemStruct *AutoVacuumShmem; -/* the database list in the launcher, and the context that contains it */ -static Dllist *DatabaseList = NULL; +/* + * the database list (of avl_dbase elements) in the launcher, and the context + * that contains it + */ +static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList); static MemoryContext DatabaseListCxt = NULL; /* Pointer to my own WorkerInfo, valid on each worker */ @@ -508,7 +512,7 @@ AutoVacLauncherMain(int argc, char *argv[]) /* don't leave dangling pointers to freed memory */ DatabaseListCxt = NULL; - DatabaseList = NULL; + dlist_init(&DatabaseList); /* * Make sure pgstat also considers our stat data as gone. Note: we @@ -576,7 +580,6 @@ AutoVacLauncherMain(int argc, char *argv[]) struct timeval nap; TimestampTz current_time = 0; bool can_launch; - Dlelem *elem; int rc; /* @@ -586,7 +589,7 @@ AutoVacLauncherMain(int argc, char *argv[]) * wakening conditions. 
*/ - launcher_determine_sleep((AutoVacuumShmem->av_freeWorkers != NULL), + launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers), false, &nap); /* Allow sinval catchup interrupts while sleeping */ @@ -679,7 +682,7 @@ AutoVacLauncherMain(int argc, char *argv[]) current_time = GetCurrentTimestamp(); LWLockAcquire(AutovacuumLock, LW_SHARED); - can_launch = (AutoVacuumShmem->av_freeWorkers != NULL); + can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers); if (AutoVacuumShmem->av_startingWorker != NULL) { @@ -721,8 +724,8 @@ AutoVacLauncherMain(int argc, char *argv[]) worker->wi_tableoid = InvalidOid; worker->wi_proc = NULL; worker->wi_launchtime = 0; - worker->wi_links.next = (SHM_QUEUE *) AutoVacuumShmem->av_freeWorkers; - AutoVacuumShmem->av_freeWorkers = worker; + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &worker->wi_links); AutoVacuumShmem->av_startingWorker = NULL; elog(WARNING, "worker took too long to start; canceled"); } @@ -738,20 +741,7 @@ AutoVacLauncherMain(int argc, char *argv[]) /* We're OK to start a new worker */ - elem = DLGetTail(DatabaseList); - if (elem != NULL) - { - avl_dbase *avdb = DLE_VAL(elem); - - /* - * launch a worker if next_worker is right now or it is in the - * past - */ - if (TimestampDifferenceExceeds(avdb->adl_next_worker, - current_time, 0)) - launch_worker(current_time); - } - else + if (dlist_is_empty(&DatabaseList)) { /* * Special case when the list is empty: start a worker right away. @@ -763,6 +753,25 @@ AutoVacLauncherMain(int argc, char *argv[]) */ launch_worker(current_time); } + else + { + /* + * because rebuild_database_list constructs a list with most + * distant adl_next_worker first, we obtain our database from the + * tail of the list. + */ + avl_dbase *avdb; + + avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList); + + /* + * launch a worker if next_worker is right now or it is in the + * past + */ + if (TimestampDifferenceExceeds(avdb->adl_next_worker, + current_time, 0)) + launch_worker(current_time); + } } /* Normal exit from the autovac launcher is here */ @@ -783,8 +792,6 @@ AutoVacLauncherMain(int argc, char *argv[]) static void launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval * nap) { - Dlelem *elem; - /* * We sleep until the next scheduled vacuum. 
We trust that when the * database list was built, care was taken so that no entries have times @@ -796,14 +803,16 @@ launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval * nap) nap->tv_sec = autovacuum_naptime; nap->tv_usec = 0; } - else if ((elem = DLGetTail(DatabaseList)) != NULL) + else if (!dlist_is_empty(&DatabaseList)) { - avl_dbase *avdb = DLE_VAL(elem); TimestampTz current_time = GetCurrentTimestamp(); TimestampTz next_wakeup; + avl_dbase *avdb; long secs; int usecs; + avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList); + next_wakeup = avdb->adl_next_worker; TimestampDifference(current_time, next_wakeup, &secs, &usecs); @@ -867,6 +876,7 @@ rebuild_database_list(Oid newdb) int score; int nelems; HTAB *dbhash; + dlist_iter iter; /* use fresh stats */ autovac_refresh_stats(); @@ -927,36 +937,28 @@ rebuild_database_list(Oid newdb) } /* Now insert the databases from the existing list */ - if (DatabaseList != NULL) + dlist_foreach(iter, &DatabaseList) { - Dlelem *elem; - - elem = DLGetHead(DatabaseList); - while (elem != NULL) - { - avl_dbase *avdb = DLE_VAL(elem); - avl_dbase *db; - bool found; - PgStat_StatDBEntry *entry; - - elem = DLGetSucc(elem); + avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur); + avl_dbase *db; + bool found; + PgStat_StatDBEntry *entry; - /* - * skip databases with no stat entries -- in particular, this gets - * rid of dropped databases - */ - entry = pgstat_fetch_stat_dbentry(avdb->adl_datid); - if (entry == NULL) - continue; + /* + * skip databases with no stat entries -- in particular, this gets + * rid of dropped databases + */ + entry = pgstat_fetch_stat_dbentry(avdb->adl_datid); + if (entry == NULL) + continue; - db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found); + db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found); - if (!found) - { - /* hash_search already filled in the key */ - db->adl_score = score++; - /* next_worker is filled in later */ - } + if (!found) + { + /* hash_search already filled in the key */ + db->adl_score = score++; + /* next_worker is filled in later */ } } @@ -987,7 +989,7 @@ rebuild_database_list(Oid newdb) /* from here on, the allocated memory belongs to the new list */ MemoryContextSwitchTo(newcxt); - DatabaseList = DLNewList(); + dlist_init(&DatabaseList); if (nelems > 0) { @@ -1029,15 +1031,13 @@ rebuild_database_list(Oid newdb) for (i = 0; i < nelems; i++) { avl_dbase *db = &(dbary[i]); - Dlelem *elem; current_time = TimestampTzPlusMilliseconds(current_time, millis_increment); db->adl_next_worker = current_time; - elem = DLNewElem(db); /* later elements should go closer to the head of the list */ - DLAddHead(DatabaseList, elem); + dlist_push_head(&DatabaseList, &db->adl_node); } } @@ -1086,7 +1086,7 @@ do_start_worker(void) /* return quickly when there are no free workers */ LWLockAcquire(AutovacuumLock, LW_SHARED); - if (AutoVacuumShmem->av_freeWorkers == NULL) + if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers)) { LWLockRelease(AutovacuumLock); return InvalidOid; @@ -1147,7 +1147,7 @@ do_start_worker(void) foreach(cell, dblist) { avw_dbase *tmp = lfirst(cell); - Dlelem *elem; + dlist_iter iter; /* Check to see if this one is at risk of wraparound */ if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit)) @@ -1179,11 +1179,10 @@ do_start_worker(void) * autovacuum time yet. */ skipit = false; - elem = DatabaseList ? 
DLGetTail(DatabaseList) : NULL; - while (elem != NULL) + dlist_reverse_foreach(iter, &DatabaseList) { - avl_dbase *dbp = DLE_VAL(elem); + avl_dbase *dbp = dlist_container(avl_dbase, adl_node, iter.cur); if (dbp->adl_datid == tmp->adw_datid) { @@ -1200,7 +1199,6 @@ do_start_worker(void) break; } - elem = DLGetPred(elem); } if (skipit) continue; @@ -1218,20 +1216,17 @@ do_start_worker(void) if (avdb != NULL) { WorkerInfo worker; + dlist_node *wptr; LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); /* * Get a worker entry from the freelist. We checked above, so there - * really should be a free slot -- complain very loudly if there - * isn't. + * really should be a free slot. */ - worker = AutoVacuumShmem->av_freeWorkers; - if (worker == NULL) - elog(FATAL, "no free worker found"); - - AutoVacuumShmem->av_freeWorkers = (WorkerInfo) worker->wi_links.next; + wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers); + worker = dlist_container(WorkerInfoData, wi_links, wptr); worker->wi_dboid = avdb->adw_datid; worker->wi_proc = NULL; worker->wi_launchtime = GetCurrentTimestamp(); @@ -1274,22 +1269,25 @@ static void launch_worker(TimestampTz now) { Oid dbid; - Dlelem *elem; + dlist_iter iter; dbid = do_start_worker(); if (OidIsValid(dbid)) { + bool found = false; + /* * Walk the database list and update the corresponding entry. If the * database is not on the list, we'll recreate the list. */ - elem = (DatabaseList == NULL) ? NULL : DLGetHead(DatabaseList); - while (elem != NULL) + dlist_foreach(iter, &DatabaseList) { - avl_dbase *avdb = DLE_VAL(elem); + avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur); if (avdb->adl_datid == dbid) { + found = true; + /* * add autovacuum_naptime seconds to the current time, and use * that as the new "next_worker" field for this database. @@ -1297,10 +1295,9 @@ launch_worker(TimestampTz now) avdb->adl_next_worker = TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000); - DLMoveToFront(elem); + dlist_move_head(&DatabaseList, iter.cur); break; } - elem = DLGetSucc(elem); } /* @@ -1310,7 +1307,7 @@ launch_worker(TimestampTz now) * pgstat entry, but this is not a problem because we don't want to * schedule workers regularly into those in any case. 
*/ - if (elem == NULL) + if (!found) rebuild_database_list(dbid); } } @@ -1590,8 +1587,8 @@ AutoVacWorkerMain(int argc, char *argv[]) MyWorkerInfo->wi_proc = MyProc; /* insert into the running list */ - SHMQueueInsertBefore(&AutoVacuumShmem->av_runningWorkers, - &MyWorkerInfo->wi_links); + dlist_push_head(&AutoVacuumShmem->av_runningWorkers, + &MyWorkerInfo->wi_links); /* * remove from the "starting" pointer, so that the launcher can start @@ -1681,8 +1678,7 @@ FreeWorkerInfo(int code, Datum arg) */ AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid; - SHMQueueDelete(&MyWorkerInfo->wi_links); - MyWorkerInfo->wi_links.next = (SHM_QUEUE *) AutoVacuumShmem->av_freeWorkers; + dlist_delete(&MyWorkerInfo->wi_links); MyWorkerInfo->wi_dboid = InvalidOid; MyWorkerInfo->wi_tableoid = InvalidOid; MyWorkerInfo->wi_proc = NULL; @@ -1690,7 +1686,8 @@ FreeWorkerInfo(int code, Datum arg) MyWorkerInfo->wi_cost_delay = 0; MyWorkerInfo->wi_cost_limit = 0; MyWorkerInfo->wi_cost_limit_base = 0; - AutoVacuumShmem->av_freeWorkers = MyWorkerInfo; + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &MyWorkerInfo->wi_links); /* not mine anymore */ MyWorkerInfo = NULL; @@ -1740,7 +1737,7 @@ autovac_balance_cost(void) autovacuum_vac_cost_delay : VacuumCostDelay); double cost_total; double cost_avail; - WorkerInfo worker; + dlist_iter iter; /* not set? nothing to do */ if (vac_cost_limit <= 0 || vac_cost_delay <= 0) @@ -1748,19 +1745,14 @@ autovac_balance_cost(void) /* caculate the total base cost limit of active workers */ cost_total = 0.0; - worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers, - &AutoVacuumShmem->av_runningWorkers, - offsetof(WorkerInfoData, wi_links)); - while (worker) + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + if (worker->wi_proc != NULL && worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0) cost_total += (double) worker->wi_cost_limit_base / worker->wi_cost_delay; - - worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers, - &worker->wi_links, - offsetof(WorkerInfoData, wi_links)); } /* there are no cost limits -- nothing to do */ if (cost_total <= 0) @@ -1771,11 +1763,10 @@ autovac_balance_cost(void) * limit to autovacuum_vacuum_cost_limit. */ cost_avail = (double) vac_cost_limit / vac_cost_delay; - worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers, - &AutoVacuumShmem->av_runningWorkers, - offsetof(WorkerInfoData, wi_links)); - while (worker) + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + if (worker->wi_proc != NULL && worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0) { @@ -1797,10 +1788,6 @@ autovac_balance_cost(void) worker->wi_cost_limit, worker->wi_cost_limit_base, worker->wi_cost_delay); } - - worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers, - &worker->wi_links, - offsetof(WorkerInfoData, wi_links)); } } @@ -2177,10 +2164,10 @@ do_autovacuum(void) { Oid relid = lfirst_oid(cell); autovac_table *tab; - WorkerInfo worker; bool skipit; int stdVacuumCostDelay; int stdVacuumCostLimit; + dlist_iter iter; CHECK_FOR_INTERRUPTS(); @@ -2197,29 +2184,23 @@ do_autovacuum(void) * worker. 
*/ skipit = false; - worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers, - &AutoVacuumShmem->av_runningWorkers, - offsetof(WorkerInfoData, wi_links)); - while (worker) + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + /* ignore myself */ if (worker == MyWorkerInfo) - goto next_worker; + continue; /* ignore workers in other databases */ if (worker->wi_dboid != MyDatabaseId) - goto next_worker; + continue; if (worker->wi_tableoid == relid) { skipit = true; break; } - - next_worker: - worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers, - &worker->wi_links, - offsetof(WorkerInfoData, wi_links)); } LWLockRelease(AutovacuumLock); if (skipit) @@ -2875,8 +2856,8 @@ AutoVacuumShmemInit(void) Assert(!found); AutoVacuumShmem->av_launcherpid = 0; - AutoVacuumShmem->av_freeWorkers = NULL; - SHMQueueInit(&AutoVacuumShmem->av_runningWorkers); + dlist_init(&AutoVacuumShmem->av_freeWorkers); + dlist_init(&AutoVacuumShmem->av_runningWorkers); AutoVacuumShmem->av_startingWorker = NULL; worker = (WorkerInfo) ((char *) AutoVacuumShmem + @@ -2884,10 +2865,8 @@ AutoVacuumShmemInit(void) /* initialize the WorkerInfo free list */ for (i = 0; i < autovacuum_max_workers; i++) - { - worker[i].wi_links.next = (SHM_QUEUE *) AutoVacuumShmem->av_freeWorkers; - AutoVacuumShmem->av_freeWorkers = &worker[i]; - } + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &worker[i].wi_links); } else Assert(found); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 748fd85edb..709ccf1f25 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -183,6 +183,7 @@ BackgroundWriterMain(void) false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index c5f32059a7..18e6a4e8c4 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -291,6 +291,7 @@ CheckpointerMain(void) false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index d5d8be0587..a6c0aea3d6 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -234,8 +234,6 @@ PgArchiverMain(int argc, char *argv[]) MyProcPid = getpid(); /* reset MyProcPid */ - InitLatch(&mainloop_latch); /* initialize latch used in main loop */ - MyStartTime = time(NULL); /* record Start Time for logging */ /* @@ -247,6 +245,10 @@ PgArchiverMain(int argc, char *argv[]) elog(FATAL, "setsid() failed: %m"); #endif + InitializeLatchSupport(); /* needed for latch waits */ + + InitLatch(&mainloop_latch); /* initialize latch used in main loop */ + /* * Ignore all signals usually bound to some action in the postmaster, * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT. 
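
The latch-related hunks above change the initialization contract: the self-pipe is no longer set up lazily by InitLatch()/OwnLatch(), so any process that waits on a latch must call InitializeLatchSupport() first, and WaitLatchOrSocket() now recomputes the remaining timeout itself after an interrupted poll()/select() rather than restarting it. A minimal usage sketch for a background process follows, assuming the standard WaitLatch()/ResetLatch() API; the latch variable, timeout value, and loop body are illustrative only and not taken from the patch.

#include "postgres.h"

#include "storage/ipc.h"
#include "storage/latch.h"

static volatile Latch mainLatch;

static void
child_main_loop(void)
{
    /* Once per process, before the first InitLatch()/OwnLatch() */
    InitializeLatchSupport();

    /* Backend-local latch; signal handlers should call SetLatch(&mainLatch) */
    InitLatch(&mainLatch);

    for (;;)
    {
        int     rc;

        /* Sleep until the latch is set, but for at most 10 seconds */
        rc = WaitLatch(&mainLatch,
                       WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                       10000L);

        /* Emergency bail-out if the postmaster died while we slept */
        if (rc & WL_POSTMASTER_DEATH)
            proc_exit(1);

        /* Clear the latch before acting on whatever it signalled */
        if (rc & WL_LATCH_SET)
            ResetLatch(&mainLatch);

        /* ... periodic or signal-driven work goes here ... */
    }
}
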
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 8389d5c4ae..be3adf16d9 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3022,6 +3022,8 @@ PgstatCollectorMain(int argc, char *argv[]) elog(FATAL, "setsid() failed: %m"); #endif + InitializeLatchSupport(); /* needed for latch waits */ + /* Initialize private latch for use by signal handlers */ InitLatch(&pgStatLatch); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index dfe40492d2..6f93d93fa3 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -95,7 +95,7 @@ #include "access/xlog.h" #include "bootstrap/bootstrap.h" #include "catalog/pg_control.h" -#include "lib/dllist.h" +#include "lib/ilist.h" #include "libpq/auth.h" #include "libpq/ip.h" #include "libpq/libpq.h" @@ -146,10 +146,10 @@ typedef struct bkend int child_slot; /* PMChildSlot for this backend, if any */ bool is_autovacuum; /* is it an autovacuum process? */ bool dead_end; /* is it going to send an error and quit? */ - Dlelem elem; /* list link in BackendList */ + dlist_node elem; /* list link in BackendList */ } Backend; -static Dllist *BackendList; +static dlist_head BackendList = DLIST_STATIC_INIT(BackendList); #ifdef EXEC_BACKEND static Backend *ShmemBackendArray; @@ -1027,11 +1027,6 @@ PostmasterMain(int argc, char *argv[]) */ set_stack_base(); - /* - * Initialize the list of active backends. - */ - BackendList = DLNewList(); - /* * Initialize pipe (or process handle on Windows) that allows children to * wake up from sleep on postmaster death. @@ -1872,7 +1867,7 @@ processCancelRequest(Port *port, void *pkt) Backend *bp; #ifndef EXEC_BACKEND - Dlelem *curr; + dlist_iter iter; #else int i; #endif @@ -1886,9 +1881,9 @@ processCancelRequest(Port *port, void *pkt) * duplicate array in shared memory. */ #ifndef EXEC_BACKEND - for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr)) + dlist_foreach(iter, &BackendList) { - bp = (Backend *) DLE_VAL(curr); + bp = dlist_container(Backend, elem, iter.cur); #else for (i = MaxLivePostmasterChildren() - 1; i >= 0; i--) { @@ -2266,9 +2261,9 @@ pmdie(SIGNAL_ARGS) if (pmState == PM_RECOVERY) { /* - * Only startup, bgwriter, and checkpointer should be active - * in this state; we just signaled the first two, and we don't - * want to kill checkpointer yet. + * Only startup, bgwriter, walreceiver, and/or checkpointer + * should be active in this state; we just signaled the first + * three, and we don't want to kill checkpointer yet. */ pmState = PM_WAIT_BACKENDS; } @@ -2359,6 +2354,18 @@ reaper(SIGNAL_ARGS) { StartupPID = 0; + /* + * Startup process exited in response to a shutdown request (or it + * completed normally regardless of the shutdown request). + */ + if (Shutdown > NoShutdown && + (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) + { + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + continue; + } + /* * Unexpected exit of startup process (including FATAL exit) * during PM_STARTUP is treated as catastrophic. There are no @@ -2373,18 +2380,6 @@ reaper(SIGNAL_ARGS) ExitPostmaster(1); } - /* - * Startup process exited in response to a shutdown request (or it - * completed normally regardless of the shutdown request). 
- */ - if (Shutdown > NoShutdown && - (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) - { - pmState = PM_WAIT_BACKENDS; - /* PostmasterStateMachine logic does the rest */ - continue; - } - /* * After PM_STARTUP, any unexpected exit (including FATAL exit) of * the startup process is catastrophic, so kill other children, @@ -2648,7 +2643,7 @@ static void CleanupBackend(int pid, int exitstatus) /* child's exit status. */ { - Dlelem *curr; + dlist_mutable_iter iter; LogChildExit(DEBUG2, _("server process"), pid, exitstatus); @@ -2680,9 +2675,9 @@ CleanupBackend(int pid, return; } - for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr)) + dlist_foreach_modify(iter, &BackendList) { - Backend *bp = (Backend *) DLE_VAL(curr); + Backend *bp = dlist_container(Backend, elem, iter.cur); if (bp->pid == pid) { @@ -2701,7 +2696,7 @@ CleanupBackend(int pid, ShmemBackendArrayRemove(bp); #endif } - DLRemove(curr); + dlist_delete(iter.cur); free(bp); break; } @@ -2718,8 +2713,7 @@ CleanupBackend(int pid, static void HandleChildCrash(int pid, int exitstatus, const char *procname) { - Dlelem *curr, - *next; + dlist_mutable_iter iter; Backend *bp; /* @@ -2734,10 +2728,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) } /* Process regular backends */ - for (curr = DLGetHead(BackendList); curr; curr = next) + dlist_foreach_modify(iter, &BackendList) { - next = DLGetSucc(curr); - bp = (Backend *) DLE_VAL(curr); + bp = dlist_container(Backend, elem, iter.cur); + if (bp->pid == pid) { /* @@ -2750,7 +2744,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) ShmemBackendArrayRemove(bp); #endif } - DLRemove(curr); + dlist_delete(iter.cur); free(bp); /* Keep looping so we can signal remaining backends */ } @@ -3113,7 +3107,7 @@ PostmasterStateMachine(void) * normal state transition leading up to PM_WAIT_DEAD_END, or during * FatalError processing. */ - if (DLGetHead(BackendList) == NULL && + if (dlist_is_empty(&BackendList) && PgArchPID == 0 && PgStatPID == 0) { /* These other guys should be dead already */ @@ -3239,12 +3233,12 @@ signal_child(pid_t pid, int signal) static bool SignalSomeChildren(int signal, int target) { - Dlelem *curr; + dlist_iter iter; bool signaled = false; - for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr)) + dlist_foreach(iter, &BackendList) { - Backend *bp = (Backend *) DLE_VAL(curr); + Backend *bp = dlist_container(Backend, elem, iter.cur); if (bp->dead_end) continue; @@ -3382,8 +3376,8 @@ BackendStartup(Port *port) */ bn->pid = pid; bn->is_autovacuum = false; - DLInitElem(&bn->elem, bn); - DLAddHead(BackendList, &bn->elem); + dlist_push_head(&BackendList, &bn->elem); + #ifdef EXEC_BACKEND if (!bn->dead_end) ShmemBackendArrayAdd(bn); @@ -4289,7 +4283,7 @@ sigusr1_handler(SIGNAL_ARGS) * first. We don't want to go back to recovery in that case. */ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) && - pmState == PM_STARTUP) + pmState == PM_STARTUP && Shutdown == NoShutdown) { /* WAL redo has started. We're out of reinitialization. */ FatalError = false; @@ -4306,7 +4300,7 @@ sigusr1_handler(SIGNAL_ARGS) pmState = PM_RECOVERY; } if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && - pmState == PM_RECOVERY) + pmState == PM_RECOVERY && Shutdown == NoShutdown) { /* * Likewise, start other special children as needed. 
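
The postmaster and autovacuum hunks above replace the old Dllist/SHM_QUEUE machinery with the embedded doubly linked lists from lib/ilist.h: the link node lives inside the element itself, so pushing, walking, and unlinking need no separately allocated list cells. A small illustrative sketch of that pattern, using only dlist_* calls that appear in this patch; the MyEntry struct and the function names are invented for the example.

#include "postgres.h"

#include "lib/ilist.h"

typedef struct MyEntry
{
    int         pid;
    dlist_node  elem;           /* embedded link, as in struct bkend */
} MyEntry;

/* A statically initialized, empty list (cf. BackendList) */
static dlist_head MyList = DLIST_STATIC_INIT(MyList);

static void
add_entry(MyEntry *entry)
{
    dlist_push_head(&MyList, &entry->elem);
}

static void
remove_entry(int pid)
{
    dlist_mutable_iter iter;

    /* the _modify variant permits dlist_delete() while iterating */
    dlist_foreach_modify(iter, &MyList)
    {
        MyEntry    *cur = dlist_container(MyEntry, elem, iter.cur);

        if (cur->pid == pid)
        {
            dlist_delete(iter.cur);
            free(cur);          /* entry was malloc'd by its creator */
            break;
        }
    }
}

static bool
have_entries(void)
{
    return !dlist_is_empty(&MyList);
}
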
@@ -4337,7 +4331,8 @@ sigusr1_handler(SIGNAL_ARGS) signal_child(SysLoggerPID, SIGUSR1); } - if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER)) + if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) && + Shutdown == NoShutdown) { /* * Start one iteration of the autovacuum daemon, even if autovacuuming @@ -4351,7 +4346,8 @@ sigusr1_handler(SIGNAL_ARGS) start_autovac_launcher = true; } - if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER)) + if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) && + Shutdown == NoShutdown) { /* The autovacuum launcher wants us to start a worker process. */ StartAutovacuumWorker(); @@ -4360,7 +4356,8 @@ sigusr1_handler(SIGNAL_ARGS) if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER) && WalReceiverPID == 0 && (pmState == PM_STARTUP || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY)) + pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && + Shutdown == NoShutdown) { /* Startup Process wants us to start the walreceiver process. */ WalReceiverPID = StartWalReceiver(); @@ -4491,12 +4488,12 @@ PostmasterRandom(void) static int CountChildren(int target) { - Dlelem *curr; + dlist_iter iter; int cnt = 0; - for (curr = DLGetHead(BackendList); curr; curr = DLGetSucc(curr)) + dlist_foreach(iter, &BackendList) { - Backend *bp = (Backend *) DLE_VAL(curr); + Backend *bp = dlist_container(Backend, elem, iter.cur); if (bp->dead_end) continue; @@ -4675,8 +4672,7 @@ StartAutovacuumWorker(void) if (bn->pid > 0) { bn->is_autovacuum = true; - DLInitElem(&bn->elem, bn); - DLAddHead(BackendList, &bn->elem); + dlist_push_head(&BackendList, &bn->elem); #ifdef EXEC_BACKEND ShmemBackendArrayAdd(bn); #endif diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c index 0febf64d87..bb36633ee5 100644 --- a/src/backend/postmaster/syslogger.c +++ b/src/backend/postmaster/syslogger.c @@ -24,6 +24,7 @@ #include "postgres.h" #include +#include #include #include #include @@ -251,6 +252,8 @@ SysLoggerMain(int argc, char *argv[]) elog(FATAL, "setsid() failed: %m"); #endif + InitializeLatchSupport(); /* needed for latch waits */ + /* Initialize private latch for use by signal handlers */ InitLatch(&sysLoggerLatch); @@ -414,11 +417,23 @@ SysLoggerMain(int argc, char *argv[]) * above is still close enough. Note we can't make this calculation * until after calling logfile_rotate(), since it will advance * next_rotation_time. + * + * Also note that we need to beware of overflow in calculation of the + * timeout: with large settings of Log_RotationAge, next_rotation_time + * could be more than INT_MAX msec in the future. In that case we'll + * wait no more than INT_MAX msec, and try again. 
*/ if (Log_RotationAge > 0 && !rotation_disabled) { - if (now < next_rotation_time) - cur_timeout = (next_rotation_time - now) * 1000L; /* msec */ + pg_time_t delay; + + delay = next_rotation_time - now; + if (delay > 0) + { + if (delay > INT_MAX / 1000) + delay = INT_MAX / 1000; + cur_timeout = delay * 1000L; /* msec */ + } else cur_timeout = 0; cur_flags = WL_TIMEOUT; diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 43139017c2..c3e15ef759 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -188,6 +188,7 @@ WalWriterMain(void) false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); + AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index b613df4c6a..62135037f1 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -39,9 +39,9 @@ #include #include "access/xlog_internal.h" +#include "libpq/pqformat.h" #include "libpq/pqsignal.h" #include "miscadmin.h" -#include "replication/walprotocol.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/ipc.h" @@ -93,8 +93,8 @@ static struct XLogRecPtr Flush; /* last byte + 1 flushed in the standby */ } LogstreamResult; -static StandbyReplyMessage reply_message; -static StandbyHSFeedbackMessage feedback_message; +static StringInfoData reply_message; +static StringInfoData incoming_message; /* * About SIGTERM handling: @@ -279,10 +279,10 @@ WalReceiverMain(void) walrcv_connect(conninfo, startpoint); DisableWalRcvImmediateExit(); - /* Initialize LogstreamResult, reply_message and feedback_message */ + /* Initialize LogstreamResult and buffers for processing messages */ LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL); - MemSet(&reply_message, 0, sizeof(reply_message)); - MemSet(&feedback_message, 0, sizeof(feedback_message)); + initStringInfo(&reply_message); + initStringInfo(&incoming_message); /* Initialize the last recv timestamp */ last_recv_timestamp = GetCurrentTimestamp(); @@ -480,41 +480,58 @@ WalRcvQuickDieHandler(SIGNAL_ARGS) static void XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len) { + int hdrlen; + XLogRecPtr dataStart; + XLogRecPtr walEnd; + TimestampTz sendTime; + bool replyRequested; + + resetStringInfo(&incoming_message); + switch (type) { case 'w': /* WAL records */ { - WalDataMessageHeader msghdr; - - if (len < sizeof(WalDataMessageHeader)) + /* copy message to StringInfo */ + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (len < hdrlen) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg_internal("invalid WAL message received from primary"))); - /* memcpy is required here for alignment reasons */ - memcpy(&msghdr, buf, sizeof(WalDataMessageHeader)); - - ProcessWalSndrMessage(msghdr.walEnd, msghdr.sendTime); - - buf += sizeof(WalDataMessageHeader); - len -= sizeof(WalDataMessageHeader); - XLogWalRcvWrite(buf, len, msghdr.dataStart); + appendBinaryStringInfo(&incoming_message, buf, hdrlen); + + /* read the fields */ + dataStart = pq_getmsgint64(&incoming_message); + walEnd = pq_getmsgint64(&incoming_message); + sendTime = IntegerTimestampToTimestampTz( + pq_getmsgint64(&incoming_message)); + ProcessWalSndrMessage(walEnd, sendTime); + + buf += hdrlen; + len -= hdrlen; + XLogWalRcvWrite(buf, len, dataStart); break; } case 'k': /* Keepalive */ { - 
PrimaryKeepaliveMessage keepalive; - - if (len != sizeof(PrimaryKeepaliveMessage)) + /* copy message to StringInfo */ + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); + if (len != hdrlen) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg_internal("invalid keepalive message received from primary"))); - /* memcpy is required here for alignment reasons */ - memcpy(&keepalive, buf, sizeof(PrimaryKeepaliveMessage)); + appendBinaryStringInfo(&incoming_message, buf, hdrlen); - ProcessWalSndrMessage(keepalive.walEnd, keepalive.sendTime); + /* read the fields */ + walEnd = pq_getmsgint64(&incoming_message); + sendTime = IntegerTimestampToTimestampTz( + pq_getmsgint64(&incoming_message)); + replyRequested = pq_getmsgbyte(&incoming_message); + + ProcessWalSndrMessage(walEnd, sendTime); /* If the primary requested a reply, send one immediately */ - if (keepalive.replyRequested) + if (replyRequested) XLogWalRcvSendReply(true, false); break; } @@ -674,7 +691,7 @@ XLogWalRcvFlush(bool dying) * xmin and the current time. * * If 'force' is not set, the message is only sent if enough time has - * passed since last status update to reach wal_receiver_status_internal. + * passed since last status update to reach wal_receiver_status_interval. * If wal_receiver_status_interval is disabled altogether and 'force' is * false, this is a no-op. * @@ -685,7 +702,10 @@ XLogWalRcvFlush(bool dying) static void XLogWalRcvSendReply(bool force, bool requestReply) { - char buf[sizeof(StandbyReplyMessage) + 1]; + static XLogRecPtr writePtr = 0; + static XLogRecPtr flushPtr = 0; + XLogRecPtr applyPtr; + static TimestampTz sendTime = 0; TimestampTz now; /* @@ -708,28 +728,34 @@ XLogWalRcvSendReply(bool force, bool requestReply) * probably OK. */ if (!force - && XLByteEQ(reply_message.write, LogstreamResult.Write) - && XLByteEQ(reply_message.flush, LogstreamResult.Flush) - && !TimestampDifferenceExceeds(reply_message.sendTime, now, + && XLByteEQ(writePtr, LogstreamResult.Write) + && XLByteEQ(flushPtr, LogstreamResult.Flush) + && !TimestampDifferenceExceeds(sendTime, now, wal_receiver_status_interval * 1000)) return; + sendTime = now; /* Construct a new message */ - reply_message.write = LogstreamResult.Write; - reply_message.flush = LogstreamResult.Flush; - reply_message.apply = GetXLogReplayRecPtr(NULL); - reply_message.sendTime = now; - reply_message.replyRequested = requestReply; - - elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X", - (uint32) (reply_message.write >> 32), (uint32) reply_message.write, - (uint32) (reply_message.flush >> 32), (uint32) reply_message.flush, - (uint32) (reply_message.apply >> 32), (uint32) reply_message.apply); - - /* Prepend with the message type and send it. */ - buf[0] = 'r'; - memcpy(&buf[1], &reply_message, sizeof(StandbyReplyMessage)); - walrcv_send(buf, sizeof(StandbyReplyMessage) + 1); + writePtr = LogstreamResult.Write; + flushPtr = LogstreamResult.Flush; + applyPtr = GetXLogReplayRecPtr(NULL); + + resetStringInfo(&reply_message); + pq_sendbyte(&reply_message, 'r'); + pq_sendint64(&reply_message, writePtr); + pq_sendint64(&reply_message, flushPtr); + pq_sendint64(&reply_message, applyPtr); + pq_sendint64(&reply_message, GetCurrentIntegerTimestamp()); + pq_sendbyte(&reply_message, requestReply ? 1 : 0); + + /* Send it */ + elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X%s", + (uint32) (writePtr >> 32), (uint32) writePtr, + (uint32) (flushPtr >> 32), (uint32) flushPtr, + (uint32) (applyPtr >> 32), (uint32) applyPtr, + requestReply ? 
" (reply requested)" : ""); + + walrcv_send(reply_message.data, reply_message.len); } /* @@ -739,11 +765,11 @@ XLogWalRcvSendReply(bool force, bool requestReply) static void XLogWalRcvSendHSFeedback(void) { - char buf[sizeof(StandbyHSFeedbackMessage) + 1]; TimestampTz now; TransactionId nextXid; uint32 nextEpoch; TransactionId xmin; + static TimestampTz sendTime = 0; /* * If the user doesn't want status to be reported to the master, be sure @@ -758,9 +784,10 @@ XLogWalRcvSendHSFeedback(void) /* * Send feedback at most once per wal_receiver_status_interval. */ - if (!TimestampDifferenceExceeds(feedback_message.sendTime, now, + if (!TimestampDifferenceExceeds(sendTime, now, wal_receiver_status_interval * 1000)) return; + sendTime = now; /* * If Hot Standby is not yet active there is nothing to send. Check this @@ -783,25 +810,23 @@ XLogWalRcvSendHSFeedback(void) if (nextXid < xmin) nextEpoch--; - /* - * Always send feedback message. - */ - feedback_message.sendTime = now; - feedback_message.xmin = xmin; - feedback_message.epoch = nextEpoch; - elog(DEBUG2, "sending hot standby feedback xmin %u epoch %u", - feedback_message.xmin, - feedback_message.epoch); - - /* Prepend with the message type and send it. */ - buf[0] = 'h'; - memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage)); - walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1); + xmin, nextEpoch); + + /* Construct the the message and send it. */ + resetStringInfo(&reply_message); + pq_sendbyte(&reply_message, 'h'); + pq_sendint64(&reply_message, GetCurrentIntegerTimestamp()); + pq_sendint(&reply_message, xmin, 4); + pq_sendint(&reply_message, nextEpoch, 4); + walrcv_send(reply_message.data, reply_message.len); } /* - * Keep track of important messages from primary. + * Update shared memory status upon receiving a message from primary. + * + * 'walEnd' and 'sendTime' are the end-of-WAL and timestamp of the latest + * message, reported by primary. */ static void ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 2af38f1cbe..8774d7e822 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -48,7 +48,6 @@ #include "nodes/replnodes.h" #include "replication/basebackup.h" #include "replication/syncrep.h" -#include "replication/walprotocol.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -66,6 +65,16 @@ #include "utils/timeout.h" #include "utils/timestamp.h" +/* + * Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ. + * + * We don't have a good idea of what a good value would be; there's some + * overhead per message in both walsender and walreceiver, but on the other + * hand sending large batches makes walsender less responsive to signals + * because signals are checked only between messages. 128kB (with + * default 8k blocks) seems like a reasonable guess for now. + */ +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* Array of WalSnds in shared memory */ WalSndCtlData *WalSndCtl = NULL; @@ -103,13 +112,10 @@ static uint32 sendOff = 0; */ static XLogRecPtr sentPtr = 0; -/* Buffer for processing reply messages. */ +/* Buffers for constructing outgoing messages and processing reply messages. */ +static StringInfoData output_message; static StringInfoData reply_message; -/* - * Buffer for constructing outgoing messages. 
- * (1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE bytes) - */ -static char *output_message; +static StringInfoData tmpbuf; /* * Timestamp of the last receipt of the reply from the standby. @@ -526,17 +532,26 @@ ProcessStandbyMessage(void) static void ProcessStandbyReplyMessage(void) { - StandbyReplyMessage reply; - - pq_copymsgbytes(&reply_message, (char *) &reply, sizeof(StandbyReplyMessage)); - - elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X", - (uint32) (reply.write >> 32), (uint32) reply.write, - (uint32) (reply.flush >> 32), (uint32) reply.flush, - (uint32) (reply.apply >> 32), (uint32) reply.apply); + XLogRecPtr writePtr, + flushPtr, + applyPtr; + bool replyRequested; + + /* the caller already consumed the msgtype byte */ + writePtr = pq_getmsgint64(&reply_message); + flushPtr = pq_getmsgint64(&reply_message); + applyPtr = pq_getmsgint64(&reply_message); + (void) pq_getmsgint64(&reply_message); /* sendTime; not used ATM */ + replyRequested = pq_getmsgbyte(&reply_message); + + elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s", + (uint32) (writePtr >> 32), (uint32) writePtr, + (uint32) (flushPtr >> 32), (uint32) flushPtr, + (uint32) (applyPtr >> 32), (uint32) applyPtr, + replyRequested ? " (reply requested)" : ""); /* Send a reply if the standby requested one. */ - if (reply.replyRequested) + if (replyRequested) WalSndKeepalive(false); /* @@ -548,9 +563,9 @@ ProcessStandbyReplyMessage(void) volatile WalSnd *walsnd = MyWalSnd; SpinLockAcquire(&walsnd->mutex); - walsnd->write = reply.write; - walsnd->flush = reply.flush; - walsnd->apply = reply.apply; + walsnd->write = writePtr; + walsnd->flush = flushPtr; + walsnd->apply = applyPtr; SpinLockRelease(&walsnd->mutex); } @@ -564,20 +579,25 @@ ProcessStandbyReplyMessage(void) static void ProcessStandbyHSFeedbackMessage(void) { - StandbyHSFeedbackMessage reply; TransactionId nextXid; uint32 nextEpoch; + TransactionId feedbackXmin; + uint32 feedbackEpoch; - /* Decipher the reply message */ - pq_copymsgbytes(&reply_message, (char *) &reply, - sizeof(StandbyHSFeedbackMessage)); + /* + * Decipher the reply message. The caller already consumed the msgtype + * byte. + */ + (void) pq_getmsgint64(&reply_message); /* sendTime; not used ATM */ + feedbackXmin = pq_getmsgint(&reply_message, 4); + feedbackEpoch = pq_getmsgint(&reply_message, 4); elog(DEBUG2, "hot standby feedback xmin %u epoch %u", - reply.xmin, - reply.epoch); + feedbackXmin, + feedbackEpoch); /* Ignore invalid xmin (can't actually happen with current walreceiver) */ - if (!TransactionIdIsNormal(reply.xmin)) + if (!TransactionIdIsNormal(feedbackXmin)) return; /* @@ -589,18 +609,18 @@ ProcessStandbyHSFeedbackMessage(void) */ GetNextXidAndEpoch(&nextXid, &nextEpoch); - if (reply.xmin <= nextXid) + if (feedbackXmin <= nextXid) { - if (reply.epoch != nextEpoch) + if (feedbackEpoch != nextEpoch) return; } else { - if (reply.epoch + 1 != nextEpoch) + if (feedbackEpoch + 1 != nextEpoch) return; } - if (!TransactionIdPrecedesOrEquals(reply.xmin, nextXid)) + if (!TransactionIdPrecedesOrEquals(feedbackXmin, nextXid)) return; /* epoch OK, but it's wrapped around */ /* @@ -610,9 +630,9 @@ ProcessStandbyHSFeedbackMessage(void) * cleanup conflicts on the standby server. 
* * There is a small window for a race condition here: although we just - * checked that reply.xmin precedes nextXid, the nextXid could have gotten + * checked that feedbackXmin precedes nextXid, the nextXid could have gotten * advanced between our fetching it and applying the xmin below, perhaps - * far enough to make reply.xmin wrap around. In that case the xmin we + * far enough to make feedbackXmin wrap around. In that case the xmin we * set here would be "in the future" and have no effect. No point in * worrying about this since it's too late to save the desired data * anyway. Assuming that the standby sends us an increasing sequence of @@ -625,7 +645,7 @@ ProcessStandbyHSFeedbackMessage(void) * safe, and if we're moving it backwards, well, the data is at risk * already since a VACUUM could have just finished calling GetOldestXmin.) */ - MyPgXact->xmin = reply.xmin; + MyPgXact->xmin = feedbackXmin; } /* Main loop of walsender process that streams the WAL over Copy messages. */ @@ -635,17 +655,12 @@ WalSndLoop(void) bool caughtup = false; /* - * Allocate buffer that will be used for each output message. We do this - * just once to reduce palloc overhead. The buffer must be made large - * enough for maximum-sized messages. - */ - output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE); - - /* - * Allocate buffer that will be used for processing reply messages. As - * above, do this just once to reduce palloc overhead. + * Allocate buffers that will be used for each outgoing and incoming + * message. We do this just once to reduce palloc overhead. */ + initStringInfo(&output_message); initStringInfo(&reply_message); + initStringInfo(&tmpbuf); /* Initialize the last reply timestamp */ last_reply_timestamp = GetCurrentTimestamp(); @@ -1048,7 +1063,6 @@ XLogSend(bool *caughtup) XLogRecPtr startptr; XLogRecPtr endptr; Size nbytes; - WalDataMessageHeader msghdr; /* * Attempt to send all data that's already been written out and fsync'd to @@ -1125,25 +1139,31 @@ XLogSend(bool *caughtup) /* * OK to read and send the slice. */ - output_message[0] = 'w'; + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); + + pq_sendint64(&output_message, startptr); /* dataStart */ + pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ /* * Read the log directly into the output buffer to avoid extra memcpy * calls. */ - XLogRead(output_message + 1 + sizeof(WalDataMessageHeader), startptr, nbytes); + enlargeStringInfo(&output_message, nbytes); + XLogRead(&output_message.data[output_message.len], startptr, nbytes); + output_message.len += nbytes; + output_message.data[output_message.len] = '\0'; /* - * We fill the message header last so that the send timestamp is taken as - * late as possible. + * Fill the send timestamp last, so that it is taken as late as possible. 
*/ - msghdr.dataStart = startptr; - msghdr.walEnd = SendRqstPtr; - msghdr.sendTime = GetCurrentTimestamp(); + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, GetCurrentIntegerTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); - memcpy(output_message + 1, &msghdr, sizeof(WalDataMessageHeader)); - - pq_putmessage_noblock('d', output_message, 1 + sizeof(WalDataMessageHeader) + nbytes); + pq_putmessage_noblock('d', output_message.data, output_message.len); sentPtr = endptr; @@ -1518,19 +1538,17 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) static void WalSndKeepalive(bool requestReply) { - PrimaryKeepaliveMessage keepalive_message; - - /* Construct a new message */ - keepalive_message.walEnd = sentPtr; - keepalive_message.sendTime = GetCurrentTimestamp(); - keepalive_message.replyRequested = requestReply; - elog(DEBUG2, "sending replication keepalive"); - /* Prepend with the message type and send it. */ - output_message[0] = 'k'; - memcpy(output_message + 1, &keepalive_message, sizeof(PrimaryKeepaliveMessage)); - pq_putmessage_noblock('d', output_message, sizeof(PrimaryKeepaliveMessage) + 1); + /* construct the message... */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 'k'); + pq_sendint64(&output_message, sentPtr); + pq_sendint64(&output_message, GetCurrentIntegerTimestamp()); + pq_sendbyte(&output_message, requestReply ? 1 : 0); + + /* ... and send it wrapped in CopyData */ + pq_putmessage_noblock('d', output_message.data, output_message.len); } /* diff --git a/src/backend/rewrite/rewriteDefine.c b/src/backend/rewrite/rewriteDefine.c index 8efc9fc9d5..55b0fed5f7 100644 --- a/src/backend/rewrite/rewriteDefine.c +++ b/src/backend/rewrite/rewriteDefine.c @@ -18,6 +18,7 @@ #include "access/htup_details.h" #include "catalog/catalog.h" #include "catalog/dependency.h" +#include "catalog/heap.h" #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/objectaccess.h" @@ -510,13 +511,19 @@ DefineQueryRewrite(char *rulename, } /* - * IF the relation is becoming a view, delete the storage files associated - * with it. NB: we had better have AccessExclusiveLock to do this ... + * If the relation is becoming a view, delete the storage files associated + * with it. Also, get rid of any system attribute entries in pg_attribute, + * because a view shouldn't have any of those. + * + * NB: we had better have AccessExclusiveLock to do this ... * * XXX what about getting rid of its TOAST table? For now, we don't. */ if (RelisBecomingView) + { RelationDropStorage(event_relation); + DeleteSystemAttributeTuples(event_relid); + } /* Close rel, but keep lock till commit... */ heap_close(event_relation, NoLock); diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 8f75948d0d..b785c269a0 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -502,25 +502,27 @@ rewriteRuleAction(Query *parsetree, AddQual(sub_action, parsetree->jointree->quals); /* - * Rewrite new.attribute w/ right hand side of target-list entry for + * Rewrite new.attribute with right hand side of target-list entry for * appropriate field name in insert/update. * - * KLUGE ALERT: since ResolveNew returns a mutated copy, we can't just - * apply it to sub_action; we have to remember to update the sublink - * inside rule_action, too. 
+ * KLUGE ALERT: since ReplaceVarsFromTargetList returns a mutated copy, we + * can't just apply it to sub_action; we have to remember to update the + * sublink inside rule_action, too. */ if ((event == CMD_INSERT || event == CMD_UPDATE) && sub_action->commandType != CMD_UTILITY) { - sub_action = (Query *) ResolveNew((Node *) sub_action, - new_varno, - 0, - rt_fetch(new_varno, - sub_action->rtable), - parsetree->targetList, - event, - current_varno, - NULL); + sub_action = (Query *) + ReplaceVarsFromTargetList((Node *) sub_action, + new_varno, + 0, + rt_fetch(new_varno, sub_action->rtable), + parsetree->targetList, + (event == CMD_UPDATE) ? + REPLACEVARS_CHANGE_VARNO : + REPLACEVARS_SUBSTITUTE_NULL, + current_varno, + NULL); if (sub_action_ptr) *sub_action_ptr = sub_action; else @@ -543,15 +545,15 @@ rewriteRuleAction(Query *parsetree, errmsg("cannot have RETURNING lists in multiple rules"))); *returning_flag = true; rule_action->returningList = (List *) - ResolveNew((Node *) parsetree->returningList, - parsetree->resultRelation, - 0, - rt_fetch(parsetree->resultRelation, - parsetree->rtable), - rule_action->returningList, - CMD_SELECT, - 0, - &rule_action->hasSubLinks); + ReplaceVarsFromTargetList((Node *) parsetree->returningList, + parsetree->resultRelation, + 0, + rt_fetch(parsetree->resultRelation, + parsetree->rtable), + rule_action->returningList, + REPLACEVARS_REPORT_ERROR, + 0, + &rule_action->hasSubLinks); /* * There could have been some SubLinks in parsetree's returningList, @@ -1703,14 +1705,17 @@ CopyAndAddInvertedQual(Query *parsetree, ChangeVarNodes(new_qual, PRS2_OLD_VARNO, rt_index, 0); /* Fix references to NEW */ if (event == CMD_INSERT || event == CMD_UPDATE) - new_qual = ResolveNew(new_qual, - PRS2_NEW_VARNO, - 0, - rt_fetch(rt_index, parsetree->rtable), - parsetree->targetList, - event, - rt_index, - &parsetree->hasSubLinks); + new_qual = ReplaceVarsFromTargetList(new_qual, + PRS2_NEW_VARNO, + 0, + rt_fetch(rt_index, + parsetree->rtable), + parsetree->targetList, + (event == CMD_UPDATE) ? + REPLACEVARS_CHANGE_VARNO : + REPLACEVARS_SUBSTITUTE_NULL, + rt_index, + &parsetree->hasSubLinks); /* And attach the fixed qual */ AddInvertedQual(parsetree, new_qual); diff --git a/src/backend/rewrite/rewriteManip.c b/src/backend/rewrite/rewriteManip.c index ef04c342a2..bea00e48e5 100644 --- a/src/backend/rewrite/rewriteManip.c +++ b/src/backend/rewrite/rewriteManip.c @@ -1323,12 +1323,16 @@ map_variable_attnos(Node *node, /* - * ResolveNew - replace Vars with corresponding items from a targetlist + * ReplaceVarsFromTargetList - replace Vars with items from a targetlist * * Vars matching target_varno and sublevels_up are replaced by the * entry with matching resno from targetlist, if there is one. - * If not, we either change the unmatched Var's varno to update_varno - * (when event == CMD_UPDATE) or replace it with a constant NULL. + * + * If there is no matching resno for such a Var, the action depends on the + * nomatch_option: + * REPLACEVARS_REPORT_ERROR: throw an error + * REPLACEVARS_CHANGE_VARNO: change Var's varno to nomatch_varno + * REPLACEVARS_SUBSTITUTE_NULL: replace Var with a NULL Const of same type * * The caller must also provide target_rte, the RTE describing the target * relation. This is needed to handle whole-row Vars referencing the target. 
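
For context on how the renamed function is meant to be called, here is a caller-side sketch patterned directly on the rewriteHandler.c hunks above; it introduces no new API, it only spells out how the no-match option is chosen. REPLACEVARS_REPORT_ERROR complains about an unmatched Var, REPLACEVARS_CHANGE_VARNO redirects it to nomatch_varno (the UPDATE case), and REPLACEVARS_SUBSTITUTE_NULL turns it into a typed NULL constant (the INSERT case).

    /*
     * Replace NEW.column references in a rule qual with the expressions
     * assigned by the INSERT/UPDATE statement (cf. CopyAndAddInvertedQual).
     */
    new_qual = ReplaceVarsFromTargetList(new_qual,
                                         PRS2_NEW_VARNO,   /* Vars to replace */
                                         0,                /* sublevels_up */
                                         rt_fetch(rt_index, parsetree->rtable),
                                         parsetree->targetList,
                                         (event == CMD_UPDATE) ?
                                         REPLACEVARS_CHANGE_VARNO :
                                         REPLACEVARS_SUBSTITUTE_NULL,
                                         rt_index,
                                         &parsetree->hasSubLinks);
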
@@ -1341,15 +1345,15 @@ typedef struct { RangeTblEntry *target_rte; List *targetlist; - int event; - int update_varno; -} ResolveNew_context; + ReplaceVarsNoMatchOption nomatch_option; + int nomatch_varno; +} ReplaceVarsFromTargetList_context; static Node * -ResolveNew_callback(Var *var, - replace_rte_variables_context *context) +ReplaceVarsFromTargetList_callback(Var *var, + replace_rte_variables_context *context) { - ResolveNew_context *rcon = (ResolveNew_context *) context->callback_arg; + ReplaceVarsFromTargetList_context *rcon = (ReplaceVarsFromTargetList_context *) context->callback_arg; TargetEntry *tle; if (var->varattno == InvalidAttrNumber) @@ -1388,29 +1392,37 @@ ResolveNew_callback(Var *var, if (tle == NULL || tle->resjunk) { - /* Failed to find column in insert/update tlist */ - if (rcon->event == CMD_UPDATE) - { - /* For update, just change unmatched var's varno */ - var = (Var *) copyObject(var); - var->varno = rcon->update_varno; - var->varnoold = rcon->update_varno; - return (Node *) var; - } - else + /* Failed to find column in targetlist */ + switch (rcon->nomatch_option) { - /* Otherwise replace unmatched var with a null */ - /* need coerce_to_domain in case of NOT NULL domain constraint */ - return coerce_to_domain((Node *) makeNullConst(var->vartype, - var->vartypmod, - var->varcollid), - InvalidOid, -1, - var->vartype, - COERCE_IMPLICIT_CAST, - -1, - false, - false); + case REPLACEVARS_REPORT_ERROR: + /* fall through, throw error below */ + break; + + case REPLACEVARS_CHANGE_VARNO: + var = (Var *) copyObject(var); + var->varno = rcon->nomatch_varno; + var->varnoold = rcon->nomatch_varno; + return (Node *) var; + + case REPLACEVARS_SUBSTITUTE_NULL: + /* + * If Var is of domain type, we should add a CoerceToDomain + * node, in case there is a NOT NULL domain constraint. + */ + return coerce_to_domain((Node *) makeNullConst(var->vartype, + var->vartypmod, + var->varcollid), + InvalidOid, -1, + var->vartype, + COERCE_IMPLICIT_CAST, + -1, + false, + false); } + elog(ERROR, "could not find replacement targetlist entry for attno %d", + var->varattno); + return NULL; /* keep compiler quiet */ } else { @@ -1426,20 +1438,23 @@ ResolveNew_callback(Var *var, } Node * -ResolveNew(Node *node, int target_varno, int sublevels_up, - RangeTblEntry *target_rte, - List *targetlist, int event, int update_varno, - bool *outer_hasSubLinks) +ReplaceVarsFromTargetList(Node *node, + int target_varno, int sublevels_up, + RangeTblEntry *target_rte, + List *targetlist, + ReplaceVarsNoMatchOption nomatch_option, + int nomatch_varno, + bool *outer_hasSubLinks) { - ResolveNew_context context; + ReplaceVarsFromTargetList_context context; context.target_rte = target_rte; context.targetlist = targetlist; - context.event = event; - context.update_varno = update_varno; + context.nomatch_option = nomatch_option; + context.nomatch_varno = nomatch_varno; return replace_rte_variables(node, target_varno, sublevels_up, - ResolveNew_callback, + ReplaceVarsFromTargetList_callback, (void *) &context, outer_hasSubLinks); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 56095b3250..dddb6c0321 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1882,16 +1882,13 @@ BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, * written.) * * If the caller has an smgr reference for the buffer's relation, pass it - * as the second parameter. If not, pass NULL. 
In the latter case, the - * relation will be marked as "transient" so that the corresponding - * kernel-level file descriptors are closed when the current transaction ends, - * if any. + * as the second parameter. If not, pass NULL. */ static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) { XLogRecPtr recptr; - ErrorContextCallback errcontext; + ErrorContextCallback errcallback; instr_time io_start, io_time; @@ -1904,17 +1901,14 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) return; /* Setup error traceback support for ereport() */ - errcontext.callback = shared_buffer_write_error_callback; - errcontext.arg = (void *) buf; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; + errcallback.callback = shared_buffer_write_error_callback; + errcallback.arg = (void *) buf; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; - /* Find smgr relation for buffer, and mark it as transient */ + /* Find smgr relation for buffer */ if (reln == NULL) - { reln = smgropen(buf->tag.rnode, InvalidBackendId); - smgrsettransient(reln); - } TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, @@ -1973,7 +1967,7 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) reln->smgr_rnode.node.relNode); /* Pop the error context stack */ - error_context_stack = errcontext.previous; + error_context_stack = errcallback.previous; } /* @@ -2259,13 +2253,13 @@ FlushRelationBuffers(Relation rel) if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) && (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) { - ErrorContextCallback errcontext; + ErrorContextCallback errcallback; /* Setup error traceback support for ereport() */ - errcontext.callback = local_buffer_write_error_callback; - errcontext.arg = (void *) bufHdr; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; + errcallback.callback = local_buffer_write_error_callback; + errcallback.arg = (void *) bufHdr; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; smgrwrite(rel->rd_smgr, bufHdr->tag.forkNum, @@ -2276,7 +2270,7 @@ FlushRelationBuffers(Relation rel) bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); /* Pop the error context stack */ - error_context_stack = errcontext.previous; + error_context_stack = errcallback.previous; } } diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index cf47708a79..973894d9c5 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -162,14 +162,14 @@ copy_file(char *fromfile, char *tofile) /* * Open the files */ - srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0); + srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); - dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, - S_IRUSR | S_IWUSR); + dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -209,12 +209,12 @@ copy_file(char *fromfile, char *tofile) (void) pg_flush_data(dstfd, offset, nbytes); } - if (close(dstfd)) + if (CloseTransientFile(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); - close(srcfd); + CloseTransientFile(srcfd); pfree(buffer); } @@ -238,13 +238,13 @@ fsync_fname(char *fname, bool isdir) * cases here */ if (!isdir) - 
fd = BasicOpenFile(fname, - O_RDWR | PG_BINARY, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(fname, + O_RDWR | PG_BINARY, + S_IRUSR | S_IWUSR); else - fd = BasicOpenFile(fname, - O_RDONLY | PG_BINARY, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(fname, + O_RDONLY | PG_BINARY, + S_IRUSR | S_IWUSR); /* * Some OSs don't allow us to open directories at all (Windows returns @@ -263,7 +263,7 @@ fsync_fname(char *fname, bool isdir) /* Some OSs don't allow us to fsync directories at all */ if (returncode != 0 && isdir && errno == EBADF) { - close(fd); + CloseTransientFile(fd); return; } @@ -272,5 +272,5 @@ fsync_fname(char *fname, bool isdir) (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", fname))); - close(fd); + CloseTransientFile(fd); } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index fed25fd7e7..07ee51cf5a 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -30,11 +30,29 @@ * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we * may find ourselves short of real file descriptors anyway. * - * This file used to contain a bunch of stuff to support RAID levels 0 - * (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone - * because the parallel query processing code that called it is all - * gone. If you really need it you could get it from the original - * POSTGRES source. + * INTERFACE ROUTINES + * + * PathNameOpenFile and OpenTemporaryFile are used to open virtual files. + * A File opened with OpenTemporaryFile is automatically deleted when the + * File is closed, either explicitly or implicitly at end of transaction or + * process exit. PathNameOpenFile is intended for files that are held open + * for a long time, like relation files. It is the caller's responsibility + * to close them, there is no automatic mechanism in fd.c for that. + * + * AllocateFile, AllocateDir and OpenTransientFile are wrappers around + * fopen(3), opendir(3), and open(2), respectively. They behave like the + * corresponding native functions, except that the handle is registered with + * the current subtransaction, and will be automatically closed at abort. + * These are intended for short operations like reading a configuration file. + * and there is a fixed limit on the number files that can be open using these + * functions at any one time. + * + * Finally, BasicOpenFile is a just thin wrapper around open() that can + * release file descriptors in use by the virtual file descriptors if + * necessary. There is no automatic cleanup of file descriptors returned by + * BasicOpenFile, it is solely the caller's responsibility to close the file + * descriptor by calling close(2). + * *------------------------------------------------------------------------- */ @@ -94,11 +112,11 @@ int max_files_per_process = 1000; /* * Maximum number of file descriptors to open for either VFD entries or - * AllocateFile/AllocateDir operations. This is initialized to a conservative - * value, and remains that way indefinitely in bootstrap or standalone-backend - * cases. In normal postmaster operation, the postmaster calls - * set_max_safe_fds() late in initialization to update the value, and that - * value is then inherited by forked subprocesses. + * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized + * to a conservative value, and remains that way indefinitely in bootstrap or + * standalone-backend cases. 
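The fd.c interface comment above introduces OpenTransientFile/CloseTransientFile as registered, auto-closed-on-abort counterparts of open(2)/close(2), and the copy_file and fsync_fname hunks show the conversion. A minimal usage sketch, assuming the backend environment (storage/fd.h, ereport); read_small_file is a hypothetical helper, not part of the patch:

#include "postgres.h"

#include <fcntl.h>
#include <unistd.h>

#include "storage/fd.h"

/* Hypothetical example: slurp up to buflen bytes of a small file. */
static int
read_small_file(char *path, char *buf, int buflen)
{
    int         fd;
    int         nread;

    /*
     * The fd is registered with the current subtransaction; if we
     * ereport(ERROR) below, the abort path closes it for us.
     */
    fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));

    nread = read(fd, buf, buflen);
    if (nread < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read file \"%s\": %m", path)));

    /* Normal exit: close (and deregister) explicitly. */
    CloseTransientFile(fd);

    return nread;
}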
In normal postmaster operation, the postmaster + * calls set_max_safe_fds() late in initialization to update the value, and + * that value is then inherited by forked subprocesses. * * Note: the value of max_files_per_process is taken into account while * setting this variable, and so need not be tested separately. @@ -126,8 +144,6 @@ int max_safe_fds = 32; /* default if not changed */ /* these are the assigned bits in fdstate below: */ #define FD_TEMPORARY (1 << 0) /* T = delete when closed */ #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */ -#define FD_XACT_TRANSIENT (1 << 2) /* T = close (not delete) at aoXact, - * but keep VFD */ typedef struct vfd { @@ -158,8 +174,11 @@ static Size SizeVfdCache = 0; */ static int nfile = 0; -/* True if there are files to close/delete at end of transaction */ -static bool have_pending_fd_cleanup = false; +/* + * Flag to tell whether it's worth scanning VfdCache looking for temp files + * to close + */ +static bool have_xact_temporary_files = false; /* * Tracks the total size of all temporary files. Note: when temp_file_limit @@ -170,10 +189,10 @@ static bool have_pending_fd_cleanup = false; static uint64 temporary_files_size = 0; /* - * List of stdio FILEs and DIRs opened with AllocateFile - * and AllocateDir. + * List of OS handles opened with AllocateFile, AllocateDir and + * OpenTransientFile. * - * Since we don't want to encourage heavy use of AllocateFile or AllocateDir, + * Since we don't want to encourage heavy use of those functions, * it seems OK to put a pretty small maximum limit on the number of * simultaneously allocated descs. */ @@ -182,7 +201,8 @@ static uint64 temporary_files_size = 0; typedef enum { AllocateDescFile, - AllocateDescDir + AllocateDescDir, + AllocateDescRawFD } AllocateDescKind; typedef struct @@ -192,6 +212,7 @@ typedef struct { FILE *file; DIR *dir; + int fd; } desc; SubTransactionId create_subid; } AllocateDesc; @@ -607,7 +628,6 @@ LruDelete(File file) Vfd *vfdP; Assert(file != 0); - Assert(!FileIsNotOpen(file)); DO_DB(elog(LOG, "LruDelete %d (%s)", file, VfdCache[file].fileName)); @@ -971,7 +991,7 @@ OpenTemporaryFile(bool interXact) VfdCache[file].resowner = CurrentResourceOwner; /* ensure cleanup happens at eoxact */ - have_pending_fd_cleanup = true; + have_xact_temporary_files = true; } return file; @@ -1044,25 +1064,6 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) return file; } -/* - * Set the transient flag on a file - * - * This flag tells CleanupTempFiles to close the kernel-level file descriptor - * (but not the VFD itself) at end of transaction. - */ -void -FileSetTransient(File file) -{ - Vfd *vfdP; - - Assert(FileIsValid(file)); - - vfdP = &VfdCache[file]; - vfdP->fdstate |= FD_XACT_TRANSIENT; - - have_pending_fd_cleanup = true; -} - /* * close a file when done with it */ @@ -1542,8 +1543,49 @@ AllocateFile(const char *name, const char *mode) return NULL; } + /* - * Free an AllocateDesc of either type. + * Like AllocateFile, but returns an unbuffered fd like open(2) + */ +int +OpenTransientFile(FileName fileName, int fileFlags, int fileMode) +{ + int fd; + + + DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)", + numAllocatedDescs, fileName)); + + /* + * The test against MAX_ALLOCATED_DESCS prevents us from overflowing + * allocatedFiles[]; the test against max_safe_fds prevents BasicOpenFile + * from hogging every one of the available FDs, which'd lead to infinite + * looping. 
+ */ + if (numAllocatedDescs >= MAX_ALLOCATED_DESCS || + numAllocatedDescs >= max_safe_fds - 1) + elog(ERROR, "exceeded MAX_ALLOCATED_DESCS while trying to open file \"%s\"", + fileName); + + fd = BasicOpenFile(fileName, fileFlags, fileMode); + + if (fd >= 0) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescRawFD; + desc->desc.fd = fd; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + + return fd; + } + + return -1; /* failure */ +} + +/* + * Free an AllocateDesc of any type. * * The argument *must* point into the allocatedDescs[] array. */ @@ -1561,6 +1603,9 @@ FreeDesc(AllocateDesc *desc) case AllocateDescDir: result = closedir(desc->desc.dir); break; + case AllocateDescRawFD: + result = close(desc->desc.fd); + break; default: elog(ERROR, "AllocateDesc kind not recognized"); result = 0; /* keep compiler quiet */ @@ -1602,6 +1647,33 @@ FreeFile(FILE *file) return fclose(file); } +/* + * Close a file returned by OpenTransientFile. + * + * Note we do not check close's return value --- it is up to the caller + * to handle close errors. + */ +int +CloseTransientFile(int fd) +{ + int i; + + DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs)); + + /* Remove fd from list of allocated files, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile"); + + return close(fd); +} /* * Routines that want to use (ie, DIR*) should use AllocateDir @@ -1863,9 +1935,8 @@ AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, * particularly care which). All still-open per-transaction temporary file * VFDs are closed, which also causes the underlying files to be deleted * (although they should've been closed already by the ResourceOwner - * cleanup). Transient files have their kernel file descriptors closed. - * Furthermore, all "allocated" stdio files are closed. We also forget any - * transaction-local temp tablespace list. + * cleanup). Furthermore, all "allocated" stdio files are closed. We also + * forget any transaction-local temp tablespace list. */ void AtEOXact_Files(void) @@ -1888,16 +1959,13 @@ AtProcExit_Files(int code, Datum arg) } /* - * General cleanup routine for fd.c. - * - * Temporary files are closed, and their underlying files deleted. - * Transient files are closed. + * Close temporary files and delete their underlying files. * * isProcExit: if true, this is being called as the backend process is * exiting. If that's the case, we should remove all temporary files; if * that's not the case, we are being called for transaction commit/abort * and should only remove transaction-local temp files. In either case, - * also clean up "allocated" stdio files and dirs. + * also clean up "allocated" stdio files, dirs and fds. */ static void CleanupTempFiles(bool isProcExit) @@ -1908,54 +1976,38 @@ CleanupTempFiles(bool isProcExit) * Careful here: at proc_exit we need extra cleanup, not just * xact_temporary files. 
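FreeDesc() above now dispatches on a third descriptor kind. In isolation, the tagged-union bookkeeping it relies on looks like this standalone miniature (hypothetical names, plain C):

#include <stdio.h>
#include <dirent.h>
#include <unistd.h>

typedef enum { DESC_FILE, DESC_DIR, DESC_RAW_FD } DescKind;

typedef struct
{
    DescKind    kind;
    union
    {
        FILE   *file;
        DIR    *dir;
        int     fd;
    }           desc;
} Desc;

/* One cleanup routine covers every kind, as FreeDesc() does above. */
static int
free_desc(Desc *d)
{
    switch (d->kind)
    {
        case DESC_FILE:
            return fclose(d->desc.file);
        case DESC_DIR:
            return closedir(d->desc.dir);
        case DESC_RAW_FD:
            return close(d->desc.fd);
    }
    return 0;                   /* unreachable; keeps compilers quiet */
}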
*/ - if (isProcExit || have_pending_fd_cleanup) + if (isProcExit || have_xact_temporary_files) { Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ for (i = 1; i < SizeVfdCache; i++) { unsigned short fdstate = VfdCache[i].fdstate; - if (VfdCache[i].fileName != NULL) + if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL) { - if (fdstate & FD_TEMPORARY) - { - /* - * If we're in the process of exiting a backend process, - * close all temporary files. Otherwise, only close - * temporary files local to the current transaction. They - * should be closed by the ResourceOwner mechanism - * already, so this is just a debugging cross-check. - */ - if (isProcExit) - FileClose(i); - else if (fdstate & FD_XACT_TEMPORARY) - { - elog(WARNING, - "temporary file %s not closed at end-of-transaction", - VfdCache[i].fileName); - FileClose(i); - } - } - else if (fdstate & FD_XACT_TRANSIENT) + /* + * If we're in the process of exiting a backend process, close + * all temporary files. Otherwise, only close temporary files + * local to the current transaction. They should be closed by + * the ResourceOwner mechanism already, so this is just a + * debugging cross-check. + */ + if (isProcExit) + FileClose(i); + else if (fdstate & FD_XACT_TEMPORARY) { - /* - * Close the FD, and remove the entry from the LRU ring, - * but also remove the flag from the VFD. This is to - * ensure that if the VFD is reused in the future for - * non-transient access, we don't close it inappropriately - * then. - */ - if (!FileIsNotOpen(i)) - LruDelete(i); - VfdCache[i].fdstate &= ~FD_XACT_TRANSIENT; + elog(WARNING, + "temporary file %s not closed at end-of-transaction", + VfdCache[i].fileName); + FileClose(i); } } } - have_pending_fd_cleanup = false; + have_xact_temporary_files = false; } - /* Clean up "allocated" stdio files and dirs. */ + /* Clean up "allocated" stdio files, dirs and fds. */ while (numAllocatedDescs > 0) FreeDesc(&allocatedDescs[0]); } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index d38164caa7..94f58a9b9d 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -400,7 +400,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - pgxact->inCommit = false; /* be sure this is cleared in abort */ + pgxact->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; /* Clear the subtransaction-XID cache too while holding the lock */ @@ -427,7 +427,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - pgxact->inCommit = false; /* be sure this is cleared in abort */ + pgxact->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; Assert(pgxact->nxids == 0); @@ -462,7 +462,7 @@ ProcArrayClearTransaction(PGPROC *proc) /* redundant, but just in case */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - pgxact->inCommit = false; + pgxact->delayChkpt = false; /* Clear the subtransaction-XID cache too */ pgxact->nxids = 0; @@ -501,6 +501,13 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) * Remove stale transactions, if any. */ ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid); + + /* + * Remove stale locks, if any. 
+ * + * Locks are always assigned to the toplevel xid so we don't need to care + * about subxcnt/subxids (and by extension not about ->suboverflowed). + */ StandbyReleaseOldLocks(running->xcnt, running->xids); /* @@ -581,13 +588,13 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) * Allocate a temporary array to avoid modifying the array passed as * argument. */ - xids = palloc(sizeof(TransactionId) * running->xcnt); + xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt)); /* * Add to the temp array any xids which have not already completed. */ nxids = 0; - for (i = 0; i < running->xcnt; i++) + for (i = 0; i < running->xcnt + running->subxcnt; i++) { TransactionId xid = running->xids[i]; @@ -1564,6 +1571,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid) * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. + * + * Note that if any transaction has overflowed its cached subtransactions + * then there is no real need include any subtransactions. That isn't a + * common enough case to worry about optimising the size of the WAL record, + * and we may wish to see that data for diagnostic purposes anyway. */ RunningTransactions GetRunningTransactionData(void) @@ -1622,15 +1634,13 @@ GetRunningTransactionData(void) oldestRunningXid = ShmemVariableCache->nextXid; /* - * Spin over procArray collecting all xids and subxids. + * Spin over procArray collecting all xids */ for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - volatile PGPROC *proc = &allProcs[pgprocno]; volatile PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; - int nxids; /* Fetch xid just once - see GetNewTransactionId */ xid = pgxact->xid; @@ -1647,30 +1657,46 @@ GetRunningTransactionData(void) if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; - /* - * Save subtransaction XIDs. Other backends can't add or remove - * entries while we're holding XidGenLock. - */ - nxids = pgxact->nxids; - if (nxids > 0) - { - memcpy(&xids[count], (void *) proc->subxids.xids, - nxids * sizeof(TransactionId)); - count += nxids; - subcount += nxids; + if (pgxact->overflowed) + suboverflowed = true; + } - if (pgxact->overflowed) - suboverflowed = true; + /* + * Spin over procArray collecting all subxids, but only if there hasn't + * been a suboverflow. + */ + if (!suboverflowed) + { + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + volatile PGPROC *proc = &allProcs[pgprocno]; + volatile PGXACT *pgxact = &allPgXact[pgprocno]; + int nxids; /* - * Top-level XID of a transaction is always less than any of its - * subxids, so we don't need to check if any of the subxids are - * smaller than oldestRunningXid + * Save subtransaction XIDs. Other backends can't add or remove + * entries while we're holding XidGenLock. 
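With subxcnt added, a running-xacts record carries one combined array: the first xcnt entries are top-level XIDs and the next subxcnt entries are subtransaction XIDs (omitted entirely when any backend has overflowed its subxid cache). A sketch of how a consumer might unpack it, assuming the patched xl_running_xacts layout from storage/standby.h; walk_running_xacts is hypothetical:

#include "postgres.h"
#include "storage/standby.h"

static void
walk_running_xacts(xl_running_xacts *xlrec,
                   TransactionId *toplevel, TransactionId *subxids)
{
    int         i;

    /* the first xcnt entries are top-level XIDs */
    for (i = 0; i < xlrec->xcnt; i++)
        toplevel[i] = xlrec->xids[i];

    /* the next subxcnt entries are subtransaction XIDs */
    for (i = 0; i < xlrec->subxcnt; i++)
        subxids[i] = xlrec->xids[xlrec->xcnt + i];
}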
*/ + nxids = pgxact->nxids; + if (nxids > 0) + { + memcpy(&xids[count], (void *) proc->subxids.xids, + nxids * sizeof(TransactionId)); + count += nxids; + subcount += nxids; + + /* + * Top-level XID of a transaction is always less than any of + * its subxids, so we don't need to check if any of the subxids + * are smaller than oldestRunningXid + */ + } } } - CurrentRunningXacts->xcnt = count; + CurrentRunningXacts->xcnt = count - subcount; + CurrentRunningXacts->subxcnt = subcount; CurrentRunningXacts->subxid_overflow = suboverflowed; CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid; CurrentRunningXacts->oldestRunningXid = oldestRunningXid; @@ -1752,65 +1778,70 @@ GetOldestActiveTransactionId(void) } /* - * GetTransactionsInCommit -- Get the XIDs of transactions that are committing + * GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are + * delaying checkpoint because they have critical actions in progress. * - * Constructs an array of XIDs of transactions that are currently in commit - * critical sections, as shown by having inCommit set in their PGXACT entries. + * Constructs an array of VXIDs of transactions that are currently in commit + * critical sections, as shown by having delayChkpt set in their PGXACT. * - * *xids_p is set to a palloc'd array that should be freed by the caller. - * The return value is the number of valid entries. + * Returns a palloc'd array that should be freed by the caller. + * *nvxids is the number of valid entries. * - * Note that because backends set or clear inCommit without holding any lock, + * Note that because backends set or clear delayChkpt without holding any lock, * the result is somewhat indeterminate, but we don't really care. Even in * a multiprocessor with delayed writes to shared memory, it should be certain - * that setting of inCommit will propagate to shared memory when the backend - * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if + * that setting of delayChkpt will propagate to shared memory when the backend + * takes a lock, so we cannot fail to see an virtual xact as delayChkpt if * it's already inserted its commit record. Whether it takes a little while - * for clearing of inCommit to propagate is unimportant for correctness. + * for clearing of delayChkpt to propagate is unimportant for correctness. 
*/ -int -GetTransactionsInCommit(TransactionId **xids_p) +VirtualTransactionId * +GetVirtualXIDsDelayingChkpt(int *nvxids) { + VirtualTransactionId *vxids; ProcArrayStruct *arrayP = procArray; - TransactionId *xids; - int nxids; + int count = 0; int index; - xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId)); - nxids = 0; + /* allocate what's certainly enough result space */ + vxids = (VirtualTransactionId *) + palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); LWLockAcquire(ProcArrayLock, LW_SHARED); for (index = 0; index < arrayP->numProcs; index++) { - int pgprocno = arrayP->pgprocnos[index]; - volatile PGXACT *pgxact = &allPgXact[pgprocno]; - TransactionId pxid; + int pgprocno = arrayP->pgprocnos[index]; + volatile PGPROC *proc = &allProcs[pgprocno]; + volatile PGXACT *pgxact = &allPgXact[pgprocno]; - /* Fetch xid just once - see GetNewTransactionId */ - pxid = pgxact->xid; + if (pgxact->delayChkpt) + { + VirtualTransactionId vxid; - if (pgxact->inCommit && TransactionIdIsValid(pxid)) - xids[nxids++] = pxid; + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } } LWLockRelease(ProcArrayLock); - *xids_p = xids; - return nxids; + *nvxids = count; + return vxids; } /* - * HaveTransactionsInCommit -- Are any of the specified XIDs in commit? + * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying? * - * This is used with the results of GetTransactionsInCommit to see if any - * of the specified XIDs are still in their commit critical sections. + * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any + * of the specified VXIDs are still in critical sections of code. * - * Note: this is O(N^2) in the number of xacts that are/were in commit, but + * Note: this is O(N^2) in the number of vxacts that are/were delaying, but * those numbers should be small enough for it not to be a problem. 
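A checkpoint-style caller would presumably pair the two functions like this: take one snapshot of the delaying VXIDs, then poll until none of them is still in its critical section. The wrapper below is a sketch only (the actual caller is not shown in this hunk) and assumes the declarations live in storage/procarray.h as before:

#include "postgres.h"
#include "storage/lock.h"
#include "storage/procarray.h"

/* Hypothetical helper: wait out in-progress commit-critical sections. */
static void
wait_for_delaying_vxacts(void)
{
    VirtualTransactionId *vxids;
    int         nvxids;

    vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
    if (nvxids > 0)
    {
        while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids))
            pg_usleep(10000L);  /* 10 ms between polls */
    }
    pfree(vxids);
}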
*/ bool -HaveTransactionsInCommit(TransactionId *xids, int nxids) +HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) { bool result = false; ProcArrayStruct *arrayP = procArray; @@ -1818,30 +1849,32 @@ HaveTransactionsInCommit(TransactionId *xids, int nxids) LWLockAcquire(ProcArrayLock, LW_SHARED); - for (index = 0; index < arrayP->numProcs; index++) + while (VirtualTransactionIdIsValid(*vxids)) { - int pgprocno = arrayP->pgprocnos[index]; - volatile PGXACT *pgxact = &allPgXact[pgprocno]; - TransactionId pxid; - - /* Fetch xid just once - see GetNewTransactionId */ - pxid = pgxact->xid; - - if (pgxact->inCommit && TransactionIdIsValid(pxid)) + for (index = 0; index < arrayP->numProcs; index++) { - int i; + int pgprocno = arrayP->pgprocnos[index]; + volatile PGPROC *proc = &allProcs[pgprocno]; + volatile PGXACT *pgxact = &allPgXact[pgprocno]; + VirtualTransactionId vxid; - for (i = 0; i < nxids; i++) + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) { - if (xids[i] == pxid) + if (VirtualTransactionIdEquals(vxid, *vxids) && + pgxact->delayChkpt) { result = true; break; } } - if (result) - break; } + + if (result) + break; + + /* The virtual transaction is gone now, wait for the next one */ + vxids++; } LWLockRelease(ProcArrayLock); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 43f7411273..9f7cce4063 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -105,6 +105,9 @@ ShutdownRecoveryTransactionEnvironment(void) /* Release all locks the tracked transactions were holding */ StandbyReleaseAllLocks(); + + /* Cleanup our VirtualTransaction */ + VirtualXactLockTableCleanup(); } @@ -540,6 +543,10 @@ StandbyTimeoutHandler(void) * RelationLockList, so we can keep track of the various entries made by * the Startup process's virtual xid in the shared lock table. * + * We record the lock against the top-level xid, rather than individual + * subtransaction xids. This means AccessExclusiveLocks held by aborted + * subtransactions are not released as early as possible on standbys. + * * List elements use type xl_rel_lock, since the WAL record type exactly * matches the information that we need to keep track of. * @@ -673,8 +680,8 @@ StandbyReleaseAllLocks(void) /* * StandbyReleaseOldLocks - * Release standby locks held by XIDs that aren't running, as long - * as they're not prepared transactions. + * Release standby locks held by top-level XIDs that aren't running, + * as long as they're not prepared transactions. 
*/ void StandbyReleaseOldLocks(int nxids, TransactionId *xids) @@ -771,6 +778,7 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record) RunningTransactionsData running; running.xcnt = xlrec->xcnt; + running.subxcnt = xlrec->subxcnt; running.subxid_overflow = xlrec->subxid_overflow; running.nextXid = xlrec->nextXid; running.latestCompletedXid = xlrec->latestCompletedXid; @@ -783,54 +791,6 @@ standby_redo(XLogRecPtr lsn, XLogRecord *record) elog(PANIC, "standby_redo: unknown op code %u", info); } -static void -standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec) -{ - int i; - - appendStringInfo(buf, " nextXid %u latestCompletedXid %u oldestRunningXid %u", - xlrec->nextXid, - xlrec->latestCompletedXid, - xlrec->oldestRunningXid); - if (xlrec->xcnt > 0) - { - appendStringInfo(buf, "; %d xacts:", xlrec->xcnt); - for (i = 0; i < xlrec->xcnt; i++) - appendStringInfo(buf, " %u", xlrec->xids[i]); - } - - if (xlrec->subxid_overflow) - appendStringInfo(buf, "; subxid ovf"); -} - -void -standby_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_STANDBY_LOCK) - { - xl_standby_locks *xlrec = (xl_standby_locks *) rec; - int i; - - appendStringInfo(buf, "AccessExclusive locks:"); - - for (i = 0; i < xlrec->nlocks; i++) - appendStringInfo(buf, " xid %u db %u rel %u", - xlrec->locks[i].xid, xlrec->locks[i].dbOid, - xlrec->locks[i].relOid); - } - else if (info == XLOG_RUNNING_XACTS) - { - xl_running_xacts *xlrec = (xl_running_xacts *) rec; - - appendStringInfo(buf, " running xacts:"); - standby_desc_running_xacts(buf, xlrec); - } - else - appendStringInfo(buf, "UNKNOWN"); -} - /* * Log details of the current snapshot to WAL. This allows the snapshot state * to be reconstructed on the standby. @@ -888,7 +848,7 @@ standby_desc(StringInfo buf, uint8 xl_info, char *rec) * from a time when they were possible. 
*/ void -LogStandbySnapshot(TransactionId *nextXid) +LogStandbySnapshot(void) { RunningTransactions running; xl_standby_lock *locks; @@ -917,8 +877,6 @@ LogStandbySnapshot(TransactionId *nextXid) LogCurrentRunningXacts(running); /* GetRunningTransactionData() acquired XidGenLock, we must release it */ LWLockRelease(XidGenLock); - - *nextXid = running->nextXid; } /* @@ -938,6 +896,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) XLogRecPtr recptr; xlrec.xcnt = CurrRunningXacts->xcnt; + xlrec.subxcnt = CurrRunningXacts->subxcnt; xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow; xlrec.nextXid = CurrRunningXacts->nextXid; xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; @@ -953,7 +912,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) { rdata[0].next = &(rdata[1]); rdata[1].data = (char *) CurrRunningXacts->xids; - rdata[1].len = xlrec.xcnt * sizeof(TransactionId); + rdata[1].len = (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId); rdata[1].buffer = InvalidBuffer; lastrdata = 1; } @@ -972,8 +931,8 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) CurrRunningXacts->nextXid); else elog(trace_recovery(DEBUG2), - "snapshot of %u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", - CurrRunningXacts->xcnt, + "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, (uint32) (recptr >> 32), (uint32) recptr, CurrRunningXacts->oldestRunningXid, CurrRunningXacts->latestCompletedXid, diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 32cc229c86..0183443746 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -210,7 +210,6 @@ static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode); static bool FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag, uint32 hashcode); static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock); -static void VirtualXactLockTableCleanup(void); /* * To make the fast-path lock mechanism work, we must have some way of @@ -3791,7 +3790,7 @@ VirtualXactLockTableInsert(VirtualTransactionId vxid) * Check whether a VXID lock has been materialized; if so, release it, * unblocking waiters. */ -static void +void VirtualXactLockTableCleanup() { bool fastpath; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 677042962a..41af7924c0 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -279,6 +279,13 @@ InitProcess(void) if (MyProc != NULL) elog(ERROR, "you already exist"); + /* + * Initialize process-local latch support. This could fail if the kernel + * is low on resources, and if so we want to exit cleanly before acquiring + * any shared-memory resources. + */ + InitializeLatchSupport(); + /* * Try to get a proc struct from the free list. If this fails, we must be * out of PGPROC structures (not to mention semaphores). 
@@ -334,6 +341,8 @@ InitProcess(void) SHMQueueElemInit(&(MyProc->links)); MyProc->waitStatus = STATUS_OK; MyProc->lxid = InvalidLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; MyPgXact->xmin = InvalidTransactionId; MyProc->pid = MyProcPid; @@ -341,7 +350,7 @@ InitProcess(void) MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; - MyPgXact->inCommit = false; + MyPgXact->delayChkpt = false; MyPgXact->vacuumFlags = 0; /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ if (IsAutoVacuumWorkerProcess()) @@ -451,6 +460,13 @@ InitAuxiliaryProcess(void) if (MyProc != NULL) elog(ERROR, "you already exist"); + /* + * Initialize process-local latch support. This could fail if the kernel + * is low on resources, and if so we want to exit cleanly before acquiring + * any shared-memory resources. + */ + InitializeLatchSupport(); + /* * We use the ProcStructLock to protect assignment and releasing of * AuxiliaryProcs entries. @@ -493,12 +509,14 @@ InitAuxiliaryProcess(void) SHMQueueElemInit(&(MyProc->links)); MyProc->waitStatus = STATUS_OK; MyProc->lxid = InvalidLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; MyPgXact->xmin = InvalidTransactionId; MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; - MyPgXact->inCommit = false; + MyPgXact->delayChkpt = false; MyPgXact->vacuumFlags = 0; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 97742b92bb..384acaeae7 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -300,9 +300,6 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) pfree(path); - if (reln->smgr_transient) - FileSetTransient(fd); - reln->md_fd[forkNum] = _fdvec_alloc(); reln->md_fd[forkNum]->mdfd_vfd = fd; @@ -404,14 +401,14 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) /* truncate(2) would be easier here, but Windows hasn't got it */ int fd; - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0); + fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0); if (fd >= 0) { int save_errno; ret = ftruncate(fd, 0); save_errno = errno; - close(fd); + CloseTransientFile(fd); errno = save_errno; } else @@ -585,9 +582,6 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior) pfree(path); - if (reln->smgr_transient) - FileSetTransient(fd); - reln->md_fd[forknum] = mdfd = _fdvec_alloc(); mdfd->mdfd_vfd = fd; @@ -1680,9 +1674,6 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, if (fd < 0) return NULL; - if (reln->smgr_transient) - FileSetTransient(fd); - /* allocate an mdfdvec entry for it */ v = _fdvec_alloc(); diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 0cec1477f3..5dff8b3702 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -76,11 +76,15 @@ static const int NSmgr = lengthof(smgrsw); /* * Each backend has a hashtable that stores all extant SMgrRelation objects. + * In addition, "unowned" SMgrRelation objects are chained together in a list. 
*/ static HTAB *SMgrRelationHash = NULL; +static SMgrRelation first_unowned_reln = NULL; + /* local function prototypes */ static void smgrshutdown(int code, Datum arg); +static void remove_from_unowned_list(SMgrRelation reln); /* @@ -124,7 +128,7 @@ smgrshutdown(int code, Datum arg) /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * - * This does not attempt to actually open the object. + * This does not attempt to actually open the underlying file. */ SMgrRelation smgropen(RelFileNode rnode, BackendId backend) @@ -144,6 +148,7 @@ smgropen(RelFileNode rnode, BackendId backend) ctl.hash = tag_hash; SMgrRelationHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_FUNCTION); + first_unowned_reln = NULL; } /* Look up or create an entry */ @@ -163,33 +168,20 @@ smgropen(RelFileNode rnode, BackendId backend) reln->smgr_targblock = InvalidBlockNumber; reln->smgr_fsm_nblocks = InvalidBlockNumber; reln->smgr_vm_nblocks = InvalidBlockNumber; - reln->smgr_transient = false; reln->smgr_which = 0; /* we only have md.c at present */ /* mark it not open */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) reln->md_fd[forknum] = NULL; + + /* place it at head of unowned list (to make smgrsetowner cheap) */ + reln->next_unowned_reln = first_unowned_reln; + first_unowned_reln = reln; } - else - /* if it was transient before, it no longer is */ - reln->smgr_transient = false; return reln; } -/* - * smgrsettransient() -- mark an SMgrRelation object as transaction-bound - * - * The main effect of this is that all opened files are marked to be - * kernel-level closed (but not necessarily VFD-closed) when the current - * transaction ends. - */ -void -smgrsettransient(SMgrRelation reln) -{ - reln->smgr_transient = true; -} - /* * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object * @@ -199,20 +191,60 @@ smgrsettransient(SMgrRelation reln) void smgrsetowner(SMgrRelation *owner, SMgrRelation reln) { + /* We don't currently support "disowning" an SMgrRelation here */ + Assert(owner != NULL); + /* * First, unhook any old owner. (Normally there shouldn't be any, but it * seems possible that this can happen during swap_relation_files() * depending on the order of processing. It's ok to close the old * relcache entry early in that case.) + * + * If there isn't an old owner, then the reln should be in the unowned + * list, and we need to remove it. */ if (reln->smgr_owner) *(reln->smgr_owner) = NULL; + else + remove_from_unowned_list(reln); /* Now establish the ownership relationship. */ reln->smgr_owner = owner; *owner = reln; } +/* + * remove_from_unowned_list -- unlink an SMgrRelation from the unowned list + * + * If the reln is not present in the list, nothing happens. Typically this + * would be caller error, but there seems no reason to throw an error. + * + * In the worst case this could be rather slow; but in all the cases that seem + * likely to be performance-critical, the reln being sought will actually be + * first in the list. Furthermore, the number of unowned relns touched in any + * one transaction shouldn't be all that high typically. So it doesn't seem + * worth expending the additional space and management logic needed for a + * doubly-linked list. 
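The function that follows implements exactly the pattern this comment describes: head insertion plus O(N) removal over a singly-linked chain. A standalone miniature of the idiom, with a hypothetical Item type standing in for SMgrRelation:

#include <stddef.h>

typedef struct Item
{
    struct Item *next;
    /* ... payload ... */
} Item;

static Item *list_head = NULL;

static void
push_item(Item *it)
{
    it->next = list_head;       /* O(1): new entries go to the front */
    list_head = it;
}

static void
remove_item(Item *target)
{
    Item      **link;

    /* walk the chain of "next" slots; no back-pointer is needed */
    for (link = &list_head; *link != NULL; link = &(*link)->next)
    {
        if (*link == target)
        {
            *link = target->next;
            target->next = NULL;
            break;
        }
    }
    /* if not found, silently do nothing, like the function below */
}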
+ */ +static void +remove_from_unowned_list(SMgrRelation reln) +{ + SMgrRelation *link; + SMgrRelation cur; + + for (link = &first_unowned_reln, cur = *link; + cur != NULL; + link = &cur->next_unowned_reln, cur = *link) + { + if (cur == reln) + { + *link = cur->next_unowned_reln; + cur->next_unowned_reln = NULL; + break; + } + } +} + /* * smgrexists() -- Does the underlying file for a fork exist? */ @@ -236,6 +268,9 @@ smgrclose(SMgrRelation reln) owner = reln->smgr_owner; + if (!owner) + remove_from_unowned_list(reln); + if (hash_search(SMgrRelationHash, (void *) &(reln->smgr_rnode), HASH_REMOVE, NULL) == NULL) @@ -617,3 +652,29 @@ smgrpostckpt(void) (*(smgrsw[i].smgr_post_ckpt)) (); } } + +/* + * AtEOXact_SMgr + * + * This routine is called during transaction commit or abort (it doesn't + * particularly care which). All transient SMgrRelation objects are closed. + * + * We do this as a compromise between wanting transient SMgrRelations to + * live awhile (to amortize the costs of blind writes of multiple blocks) + * and needing them to not live forever (since we're probably holding open + * a kernel file descriptor for the underlying file, and we need to ensure + * that gets closed reasonably soon if the file gets deleted). + */ +void +AtEOXact_SMgr(void) +{ + /* + * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each + * one from the list. + */ + while (first_unowned_reln != NULL) + { + Assert(first_unowned_reln->smgr_owner == NULL); + smgrclose(first_unowned_reln); + } +} diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 585db1af89..1fd39f2d99 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -976,6 +976,10 @@ exec_simple_query(const char *query_string) plantree_list = pg_plan_queries(querytree_list, 0, NULL); + /* Done with the snapshot used for parsing/planning */ + if (snapshot_set) + PopActiveSnapshot(); + /* If we got a cancel signal in analysis or planning, quit */ CHECK_FOR_INTERRUPTS(); @@ -1000,19 +1004,9 @@ exec_simple_query(const char *query_string) NULL); /* - * Start the portal. - * - * If we took a snapshot for parsing/planning, the portal may be able - * to reuse it for the execution phase. Currently, this will only - * happen in PORTAL_ONE_SELECT mode. But even if PortalStart doesn't - * end up being able to do this, keeping the parse/plan snapshot - * around until after we start the portal doesn't cost much. + * Start the portal. No parameters here. */ - PortalStart(portal, NULL, 0, snapshot_set); - - /* Done with the snapshot used for parsing/planning */ - if (snapshot_set) - PopActiveSnapshot(); + PortalStart(portal, NULL, 0, InvalidSnapshot); /* * Select the appropriate output format: text unless we are doing a @@ -1735,19 +1729,15 @@ exec_bind_message(StringInfo input_message) cplan->stmt_list, cplan); - /* - * And we're ready to start portal execution. - * - * If we took a snapshot for parsing/planning, we'll try to reuse it for - * query execution (currently, reuse will only occur if PORTAL_ONE_SELECT - * mode is chosen). - */ - PortalStart(portal, params, 0, snapshot_set); - /* Done with the snapshot used for parameter I/O and parsing/planning */ if (snapshot_set) PopActiveSnapshot(); + /* + * And we're ready to start portal execution. + */ + PortalStart(portal, params, 0, InvalidSnapshot); + /* * Apply the result format requests to the portal. 
*/ diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 2cb9a8ee2f..8ad0aaa55e 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -447,18 +447,17 @@ FetchStatementTargetList(Node *stmt) * currently only honored for PORTAL_ONE_SELECT portals). Most callers * should simply pass zero. * - * The use_active_snapshot parameter is currently used only for - * PORTAL_ONE_SELECT portals. If it is true, the active snapshot will - * be used when starting up the executor; if false, a new snapshot will - * be taken. This is used both for cursors and to avoid taking an entirely - * new snapshot when it isn't necessary. + * The caller can optionally pass a snapshot to be used; pass InvalidSnapshot + * for the normal behavior of setting a new snapshot. This parameter is + * presently ignored for non-PORTAL_ONE_SELECT portals (it's only intended + * to be used for cursors). * * On return, portal is ready to accept PortalRun() calls, and the result * tupdesc (if any) is known. */ void PortalStart(Portal portal, ParamListInfo params, - int eflags, bool use_active_snapshot) + int eflags, Snapshot snapshot) { Portal saveActivePortal; ResourceOwner saveResourceOwner; @@ -500,8 +499,8 @@ PortalStart(Portal portal, ParamListInfo params, case PORTAL_ONE_SELECT: /* Must set snapshot before starting executor. */ - if (use_active_snapshot) - PushActiveSnapshot(GetActiveSnapshot()); + if (snapshot) + PushActiveSnapshot(snapshot); else PushActiveSnapshot(GetTransactionSnapshot()); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 97376bb3ff..a42b8e9b53 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -972,14 +972,7 @@ standard_ProcessUtility(Node *parsetree, case T_AlterEnumStmt: /* ALTER TYPE (enum) */ if (isCompleteQuery) EventTriggerDDLCommandStart(parsetree); - - /* - * We disallow this in transaction blocks, because we can't cope - * with enum OID values getting into indexes and then having their - * defining pg_enum entries go away. - */ - PreventTransactionChain(isTopLevel, "ALTER TYPE ... ADD"); - AlterEnum((AlterEnumStmt *) parsetree); + AlterEnum((AlterEnumStmt *) parsetree, isTopLevel); break; case T_ViewStmt: /* CREATE VIEW */ @@ -1508,16 +1501,11 @@ UtilityContainsQuery(Node *parsetree) return qry; case T_CreateTableAsStmt: - /* might or might not contain a Query ... 
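A sketch of the two calling conventions the new parameter allows. Ordinary one-shot execution passes InvalidSnapshot, as the exec_simple_query and exec_bind_message hunks above do; passing the already-active snapshot for a cursor is an assumption based on the comment above, and start_portal is a hypothetical wrapper:

#include "postgres.h"
#include "tcop/pquery.h"
#include "utils/snapmgr.h"

static void
start_portal(Portal portal, ParamListInfo params, bool use_active_snapshot)
{
    if (use_active_snapshot)
    {
        /* cursor-style: keep running under the snapshot already active */
        PortalStart(portal, params, 0, GetActiveSnapshot());
    }
    else
    {
        /* normal case: let PortalStart take a fresh transaction snapshot */
        PortalStart(portal, params, 0, InvalidSnapshot);
    }
}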
*/ qry = (Query *) ((CreateTableAsStmt *) parsetree)->query; - if (IsA(qry, Query)) - { - /* Recursion currently can't be necessary here */ - Assert(qry->commandType != CMD_UTILITY); - return qry; - } - Assert(IsA(qry, ExecuteStmt)); - return NULL; + Assert(IsA(qry, Query)); + if (qry->commandType == CMD_UTILITY) + return UtilityContainsQuery(qry->utilityStmt); + return qry; default: return NULL; diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c index 4be3901449..3a6f587d06 100644 --- a/src/backend/utils/adt/int.c +++ b/src/backend/utils/adt/int.c @@ -681,18 +681,6 @@ int4mul(PG_FUNCTION_ARGS) int32 arg2 = PG_GETARG_INT32(1); int32 result; -#ifdef WIN32 - - /* - * Win32 doesn't throw a catchable exception for SELECT -2147483648 * - * (-1); -- INT_MIN - */ - if (arg2 == -1 && arg1 == INT_MIN) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); -#endif - result = arg1 * arg2; /* @@ -709,7 +697,8 @@ int4mul(PG_FUNCTION_ARGS) if (!(arg1 >= (int32) SHRT_MIN && arg1 <= (int32) SHRT_MAX && arg2 >= (int32) SHRT_MIN && arg2 <= (int32) SHRT_MAX) && arg2 != 0 && - (result / arg2 != arg1 || (arg2 == -1 && arg1 < 0 && result < 0))) + ((arg2 == -1 && arg1 < 0 && result < 0) || + result / arg2 != arg1)) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("integer out of range"))); @@ -732,30 +721,27 @@ int4div(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -#ifdef WIN32 - /* - * Win32 doesn't throw a catchable exception for SELECT -2147483648 / - * (-1); -- INT_MIN + * INT_MIN / -1 is problematic, since the result can't be represented on a + * two's-complement machine. Some machines produce INT_MIN, some produce + * zero, some throw an exception. We can dodge the problem by recognizing + * that division by -1 is the same as negation. */ - if (arg2 == -1 && arg1 == INT_MIN) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); -#endif + if (arg2 == -1) + { + result = -arg1; + /* overflow check (needed for INT_MIN) */ + if (arg1 != 0 && SAMESIGN(result, arg1)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); + PG_RETURN_INT32(result); + } + + /* No overflow is possible */ result = arg1 / arg2; - /* - * Overflow check. The only possible overflow case is for arg1 = INT_MIN, - * arg2 = -1, where the correct result is -INT_MIN, which can't be - * represented on a two's-complement machine. Most machines produce - * INT_MIN but it seems some produce zero. - */ - if (arg2 == -1 && arg1 < 0 && result <= 0) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); PG_RETURN_INT32(result); } @@ -877,18 +863,27 @@ int2div(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result = arg1 / arg2; - /* - * Overflow check. The only possible overflow case is for arg1 = - * SHRT_MIN, arg2 = -1, where the correct result is -SHRT_MIN, which can't - * be represented on a two's-complement machine. Most machines produce - * SHRT_MIN but it seems some produce zero. + * SHRT_MIN / -1 is problematic, since the result can't be represented on + * a two's-complement machine. Some machines produce SHRT_MIN, some + * produce zero, some throw an exception. We can dodge the problem by + * recognizing that division by -1 is the same as negation. 
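The same dodge, reduced to a standalone function: division by -1 becomes negation, and the lone overflow case (INT32_MIN) is caught with the SAMESIGN test used above. Like the backend code, the negation relies on two's-complement wraparound for the one case it then reports; div32_checked is a hypothetical name:

#include <stdint.h>

#define SAMESIGN(a,b)   (((a) < 0) == ((b) < 0))

/*
 * Divide a by b (caller has already rejected b == 0), reporting overflow
 * instead of letting the CPU trap on INT32_MIN / -1.  Returns 0 on
 * success, -1 on overflow; the quotient is stored through *quotient.
 */
static int
div32_checked(int32_t a, int32_t b, int32_t *quotient)
{
    if (b == -1)
    {
        *quotient = -a;         /* division by -1 is just negation */
        if (a != 0 && SAMESIGN(*quotient, a))
            return -1;          /* a was INT32_MIN: not representable */
        return 0;
    }

    *quotient = a / b;          /* no other case can overflow */
    return 0;
}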
*/ - if (arg2 == -1 && arg1 < 0 && result <= 0) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("smallint out of range"))); + if (arg2 == -1) + { + result = -arg1; + /* overflow check (needed for SHRT_MIN) */ + if (arg1 != 0 && SAMESIGN(result, arg1)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("smallint out of range"))); + PG_RETURN_INT16(result); + } + + /* No overflow is possible */ + + result = arg1 / arg2; + PG_RETURN_INT16(result); } @@ -1065,18 +1060,27 @@ int42div(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result = arg1 / arg2; - /* - * Overflow check. The only possible overflow case is for arg1 = INT_MIN, - * arg2 = -1, where the correct result is -INT_MIN, which can't be - * represented on a two's-complement machine. Most machines produce - * INT_MIN but it seems some produce zero. + * INT_MIN / -1 is problematic, since the result can't be represented on a + * two's-complement machine. Some machines produce INT_MIN, some produce + * zero, some throw an exception. We can dodge the problem by recognizing + * that division by -1 is the same as negation. */ - if (arg2 == -1 && arg1 < 0 && result <= 0) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); + if (arg2 == -1) + { + result = -arg1; + /* overflow check (needed for INT_MIN) */ + if (arg1 != 0 && SAMESIGN(result, arg1)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); + PG_RETURN_INT32(result); + } + + /* No overflow is possible */ + + result = arg1 / arg2; + PG_RETURN_INT32(result); } @@ -1095,8 +1099,12 @@ int4mod(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - /* SELECT ((-2147483648)::int4) % (-1); causes a floating point exception */ - if (arg1 == INT_MIN && arg2 == -1) + /* + * Some machines throw a floating-point exception for INT_MIN % -1, which + * is a bit silly since the correct answer is perfectly well-defined, + * namely zero. + */ + if (arg2 == -1) PG_RETURN_INT32(0); /* No overflow is possible */ @@ -1119,6 +1127,15 @@ int2mod(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } + /* + * Some machines throw a floating-point exception for INT_MIN % -1, which + * is a bit silly since the correct answer is perfectly well-defined, + * namely zero. (It's not clear this ever happens when dealing with + * int16, but we might as well have the test for safety.) + */ + if (arg2 == -1) + PG_RETURN_INT16(0); + /* No overflow is possible */ PG_RETURN_INT16(arg1 % arg2); diff --git a/src/backend/utils/adt/int8.c b/src/backend/utils/adt/int8.c index 0e59956572..c4cb1f2eff 100644 --- a/src/backend/utils/adt/int8.c +++ b/src/backend/utils/adt/int8.c @@ -574,7 +574,8 @@ int8mul(PG_FUNCTION_ARGS) if (arg1 != (int64) ((int32) arg1) || arg2 != (int64) ((int32) arg2)) { if (arg2 != 0 && - (result / arg2 != arg1 || (arg2 == -1 && arg1 < 0 && result < 0))) + ((arg2 == -1 && arg1 < 0 && result < 0) || + result / arg2 != arg1)) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("bigint out of range"))); @@ -598,18 +599,27 @@ int8div(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result = arg1 / arg2; - /* - * Overflow check. The only possible overflow case is for arg1 = - * INT64_MIN, arg2 = -1, where the correct result is -INT64_MIN, which - * can't be represented on a two's-complement machine. Most machines - * produce INT64_MIN but it seems some produce zero. + * INT64_MIN / -1 is problematic, since the result can't be represented on + * a two's-complement machine. 
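The modulo hunks apply the companion fix: arg % -1 is always 0, but some CPUs compute it via the same trapping INT_MIN / -1 division, so the -1 case is answered without dividing at all. Reordering the multiplication overflow test so the arg2 == -1 branch is checked first evidently serves the same purpose, since it keeps result / arg2 from being evaluated as INT_MIN / -1 inside the check itself. A standalone miniature of the modulo case (mod32_safe is a hypothetical name):

#include <stdint.h>

/*
 * Caller has already rejected b == 0.  b == -1 is answered without doing
 * the division, because some CPUs raise SIGFPE computing INT32_MIN % -1
 * even though the mathematical answer is simply 0.
 */
static int32_t
mod32_safe(int32_t a, int32_t b)
{
    if (b == -1)
        return 0;
    return a % b;
}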
Some machines produce INT64_MIN, some + * produce zero, some throw an exception. We can dodge the problem by + * recognizing that division by -1 is the same as negation. */ - if (arg2 == -1 && arg1 < 0 && result <= 0) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range"))); + if (arg2 == -1) + { + result = -arg1; + /* overflow check (needed for INT64_MIN) */ + if (arg1 != 0 && SAMESIGN(result, arg1)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range"))); + PG_RETURN_INT64(result); + } + + /* No overflow is possible */ + + result = arg1 / arg2; + PG_RETURN_INT64(result); } @@ -649,6 +659,14 @@ int8mod(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } + /* + * Some machines throw a floating-point exception for INT64_MIN % -1, + * which is a bit silly since the correct answer is perfectly + * well-defined, namely zero. + */ + if (arg2 == -1) + PG_RETURN_INT64(0); + /* No overflow is possible */ PG_RETURN_INT64(arg1 % arg2); @@ -830,18 +848,27 @@ int84div(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result = arg1 / arg2; - /* - * Overflow check. The only possible overflow case is for arg1 = - * INT64_MIN, arg2 = -1, where the correct result is -INT64_MIN, which - * can't be represented on a two's-complement machine. Most machines - * produce INT64_MIN but it seems some produce zero. + * INT64_MIN / -1 is problematic, since the result can't be represented on + * a two's-complement machine. Some machines produce INT64_MIN, some + * produce zero, some throw an exception. We can dodge the problem by + * recognizing that division by -1 is the same as negation. */ - if (arg2 == -1 && arg1 < 0 && result <= 0) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range"))); + if (arg2 == -1) + { + result = -arg1; + /* overflow check (needed for INT64_MIN) */ + if (arg1 != 0 && SAMESIGN(result, arg1)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range"))); + PG_RETURN_INT64(result); + } + + /* No overflow is possible */ + + result = arg1 / arg2; + PG_RETURN_INT64(result); } @@ -1018,18 +1045,27 @@ int82div(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - result = arg1 / arg2; - /* - * Overflow check. The only possible overflow case is for arg1 = - * INT64_MIN, arg2 = -1, where the correct result is -INT64_MIN, which - * can't be represented on a two's-complement machine. Most machines - * produce INT64_MIN but it seems some produce zero. + * INT64_MIN / -1 is problematic, since the result can't be represented on + * a two's-complement machine. Some machines produce INT64_MIN, some + * produce zero, some throw an exception. We can dodge the problem by + * recognizing that division by -1 is the same as negation. 
*/ - if (arg2 == -1 && arg1 < 0 && result <= 0) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range"))); + if (arg2 == -1) + { + result = -arg1; + /* overflow check (needed for INT64_MIN) */ + if (arg1 != 0 && SAMESIGN(result, arg1)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range"))); + PG_RETURN_INT64(result); + } + + /* No overflow is possible */ + + result = arg1 / arg2; + PG_RETURN_INT64(result); } diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 68c1f1de3b..b408df7faf 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -276,6 +276,16 @@ static NumericDigit const_two_data[1] = {2}; static NumericVar const_two = {1, 0, NUMERIC_POS, 0, NULL, const_two_data}; +#if DEC_DIGITS == 4 || DEC_DIGITS == 2 +static NumericDigit const_ten_data[1] = {10}; +static NumericVar const_ten = +{1, 0, NUMERIC_POS, 0, NULL, const_ten_data}; +#elif DEC_DIGITS == 1 +static NumericDigit const_ten_data[1] = {1}; +static NumericVar const_ten = +{1, 1, NUMERIC_POS, 0, NULL, const_ten_data}; +#endif + #if DEC_DIGITS == 4 static NumericDigit const_zero_point_five_data[1] = {5000}; #elif DEC_DIGITS == 2 @@ -367,8 +377,9 @@ static void zero_var(NumericVar *var); static const char *set_var_from_str(const char *str, const char *cp, NumericVar *dest); static void set_var_from_num(Numeric value, NumericVar *dest); +static void init_var_from_num(Numeric num, NumericVar *dest); static void set_var_from_var(NumericVar *value, NumericVar *dest); -static char *get_str_from_var(NumericVar *var, int dscale); +static char *get_str_from_var(NumericVar *var); static char *get_str_from_var_sci(NumericVar *var, int rscale); static Numeric make_result(NumericVar *var); @@ -533,18 +544,10 @@ numeric_out(PG_FUNCTION_ARGS) /* * Get the number in the variable format. - * - * Even if we didn't need to change format, we'd still need to copy the - * value to have a modifiable copy for rounding. set_var_from_num() also - * guarantees there is extra digit space in case we produce a carry out - * from rounding. 
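The new init_var_from_num() evidently sets up a NumericVar that borrows the digit buffer of the input Numeric instead of copying it, which is why the converted call sites can drop both the separate init_var() and the trailing free_var(): nothing was allocated, and the variable must be treated as read-only. A generic, hypothetical miniature of that copy-versus-alias distinction (View and the init_* names are invented):

#include <stdlib.h>
#include <string.h>

typedef struct View
{
    int         len;
    short      *digits;     /* either borrowed or owned */
    short      *buf;        /* non-NULL only when we own an allocation */
} View;

/* Read-only view: borrow the caller's digits, nothing to free later. */
static void
init_view_from_buffer(View *v, short *digits, int len)
{
    v->len = len;
    v->digits = digits;
    v->buf = NULL;
}

/* Writable copy: allocate and duplicate; caller must free v->buf. */
static void
init_view_as_copy(View *v, const short *digits, int len)
{
    v->buf = malloc(len * sizeof(short));   /* error handling omitted */
    memcpy(v->buf, digits, len * sizeof(short));
    v->digits = v->buf;
    v->len = len;
}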
*/ - init_var(&x); - set_var_from_num(num, &x); + init_var_from_num(num, &x); - str = get_str_from_var(&x, x.dscale); - - free_var(&x); + str = get_str_from_var(&x); PG_RETURN_CSTRING(str); } @@ -616,12 +619,10 @@ numeric_out_sci(Numeric num, int scale) if (NUMERIC_IS_NAN(num)) return pstrdup("NaN"); - init_var(&x); - set_var_from_num(num, &x); + init_var_from_num(num, &x); str = get_str_from_var_sci(&x, scale); - free_var(&x); return str; } @@ -695,8 +696,7 @@ numeric_send(PG_FUNCTION_ARGS) StringInfoData buf; int i; - init_var(&x); - set_var_from_num(num, &x); + init_var_from_num(num, &x); pq_begintypsend(&buf); @@ -707,8 +707,6 @@ numeric_send(PG_FUNCTION_ARGS) for (i = 0; i < x.ndigits; i++) pq_sendint(&buf, x.digits[i], sizeof(NumericDigit)); - free_var(&x); - PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } @@ -1150,9 +1148,7 @@ numeric_ceil(PG_FUNCTION_ARGS) if (NUMERIC_IS_NAN(num)) PG_RETURN_NUMERIC(make_result(&const_nan)); - init_var(&result); - - set_var_from_num(num, &result); + init_var_from_num(num, &result); ceil_var(&result, &result); res = make_result(&result); @@ -1177,9 +1173,7 @@ numeric_floor(PG_FUNCTION_ARGS) if (NUMERIC_IS_NAN(num)) PG_RETURN_NUMERIC(make_result(&const_nan)); - init_var(&result); - - set_var_from_num(num, &result); + init_var_from_num(num, &result); floor_var(&result, &result); res = make_result(&result); @@ -1282,13 +1276,9 @@ compute_bucket(Numeric operand, Numeric bound1, Numeric bound2, NumericVar bound2_var; NumericVar operand_var; - init_var(&bound1_var); - init_var(&bound2_var); - init_var(&operand_var); - - set_var_from_num(bound1, &bound1_var); - set_var_from_num(bound2, &bound2_var); - set_var_from_num(operand, &operand_var); + init_var_from_num(bound1, &bound1_var); + init_var_from_num(bound2, &bound2_var); + init_var_from_num(operand, &operand_var); if (cmp_var(&bound1_var, &bound2_var) < 0) { @@ -1573,19 +1563,14 @@ numeric_add(PG_FUNCTION_ARGS) /* * Unpack the values, let add_var() compute the result and return it. */ - init_var(&arg1); - init_var(&arg2); - init_var(&result); - - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); + init_var(&result); add_var(&arg1, &arg2, &result); res = make_result(&result); - free_var(&arg1); - free_var(&arg2); free_var(&result); PG_RETURN_NUMERIC(res); @@ -1616,19 +1601,14 @@ numeric_sub(PG_FUNCTION_ARGS) /* * Unpack the values, let sub_var() compute the result and return it. */ - init_var(&arg1); - init_var(&arg2); - init_var(&result); - - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); + init_var(&result); sub_var(&arg1, &arg2, &result); res = make_result(&result); - free_var(&arg1); - free_var(&arg2); free_var(&result); PG_RETURN_NUMERIC(res); @@ -1663,19 +1643,14 @@ numeric_mul(PG_FUNCTION_ARGS) * we request exact representation for the product (rscale = sum(dscale of * arg1, dscale of arg2)). 
*/ - init_var(&arg1); - init_var(&arg2); - init_var(&result); - - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); + init_var(&result); mul_var(&arg1, &arg2, &result, arg1.dscale + arg2.dscale); res = make_result(&result); - free_var(&arg1); - free_var(&arg2); free_var(&result); PG_RETURN_NUMERIC(res); @@ -1707,12 +1682,10 @@ numeric_div(PG_FUNCTION_ARGS) /* * Unpack the arguments */ - init_var(&arg1); - init_var(&arg2); - init_var(&result); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); + init_var(&result); /* * Select scale for division result @@ -1726,8 +1699,6 @@ numeric_div(PG_FUNCTION_ARGS) res = make_result(&result); - free_var(&arg1); - free_var(&arg2); free_var(&result); PG_RETURN_NUMERIC(res); @@ -1758,12 +1729,10 @@ numeric_div_trunc(PG_FUNCTION_ARGS) /* * Unpack the arguments */ - init_var(&arg1); - init_var(&arg2); - init_var(&result); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); + init_var(&result); /* * Do the divide and return the result @@ -1772,8 +1741,6 @@ numeric_div_trunc(PG_FUNCTION_ARGS) res = make_result(&result); - free_var(&arg1); - free_var(&arg2); free_var(&result); PG_RETURN_NUMERIC(res); @@ -1798,20 +1765,16 @@ numeric_mod(PG_FUNCTION_ARGS) if (NUMERIC_IS_NAN(num1) || NUMERIC_IS_NAN(num2)) PG_RETURN_NUMERIC(make_result(&const_nan)); - init_var(&arg1); - init_var(&arg2); - init_var(&result); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); + init_var(&result); mod_var(&arg1, &arg2, &result); res = make_result(&result); free_var(&result); - free_var(&arg2); - free_var(&arg1); PG_RETURN_NUMERIC(res); } @@ -1838,9 +1801,7 @@ numeric_inc(PG_FUNCTION_ARGS) /* * Compute the result and return it */ - init_var(&arg); - - set_var_from_num(num, &arg); + init_var_from_num(num, &arg); add_var(&arg, &const_one, &arg); @@ -1977,10 +1938,9 @@ numeric_sqrt(PG_FUNCTION_ARGS) * to give at least NUMERIC_MIN_SIG_DIGITS significant digits; but in any * case not less than the input's dscale. */ - init_var(&arg); - init_var(&result); + init_var_from_num(num, &arg); - set_var_from_num(num, &arg); + init_var(&result); /* Assume the input was normalized, so arg.weight is accurate */ sweight = (arg.weight + 1) * DEC_DIGITS / 2 - 1; @@ -1998,7 +1958,6 @@ numeric_sqrt(PG_FUNCTION_ARGS) res = make_result(&result); free_var(&result); - free_var(&arg); PG_RETURN_NUMERIC(res); } @@ -2030,10 +1989,9 @@ numeric_exp(PG_FUNCTION_ARGS) * to give at least NUMERIC_MIN_SIG_DIGITS significant digits; but in any * case not less than the input's dscale. 
*/ - init_var(&arg); - init_var(&result); + init_var_from_num(num, &arg); - set_var_from_num(num, &arg); + init_var(&result); /* convert input to float8, ignoring overflow */ val = numericvar_to_double_no_overflow(&arg); @@ -2061,7 +2019,6 @@ numeric_exp(PG_FUNCTION_ARGS) res = make_result(&result); free_var(&result); - free_var(&arg); PG_RETURN_NUMERIC(res); } @@ -2088,11 +2045,9 @@ numeric_ln(PG_FUNCTION_ARGS) if (NUMERIC_IS_NAN(num)) PG_RETURN_NUMERIC(make_result(&const_nan)); - init_var(&arg); + init_var_from_num(num, &arg); init_var(&result); - set_var_from_num(num, &arg); - /* Approx decimal digits before decimal point */ dec_digits = (arg.weight + 1) * DEC_DIGITS; @@ -2112,7 +2067,6 @@ numeric_ln(PG_FUNCTION_ARGS) res = make_result(&result); free_var(&result); - free_var(&arg); PG_RETURN_NUMERIC(res); } @@ -2142,13 +2096,10 @@ numeric_log(PG_FUNCTION_ARGS) /* * Initialize things */ - init_var(&arg1); - init_var(&arg2); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); init_var(&result); - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); - /* * Call log_var() to compute and return the result; note it handles scale * selection itself. @@ -2158,8 +2109,6 @@ numeric_log(PG_FUNCTION_ARGS) res = make_result(&result); free_var(&result); - free_var(&arg2); - free_var(&arg1); PG_RETURN_NUMERIC(res); } @@ -2190,15 +2139,12 @@ numeric_power(PG_FUNCTION_ARGS) /* * Initialize things */ - init_var(&arg1); - init_var(&arg2); init_var(&arg2_trunc); init_var(&result); + init_var_from_num(num1, &arg1); + init_var_from_num(num2, &arg2); - set_var_from_num(num1, &arg1); - set_var_from_num(num2, &arg2); set_var_from_var(&arg2, &arg2_trunc); - trunc_var(&arg2_trunc, 0); /* @@ -2227,9 +2173,7 @@ numeric_power(PG_FUNCTION_ARGS) res = make_result(&result); free_var(&result); - free_var(&arg2); free_var(&arg2_trunc); - free_var(&arg1); PG_RETURN_NUMERIC(res); } @@ -2276,10 +2220,8 @@ numeric_int4(PG_FUNCTION_ARGS) errmsg("cannot convert NaN to integer"))); /* Convert to variable format, then convert to int4 */ - init_var(&x); - set_var_from_num(num, &x); + init_var_from_num(num, &x); result = numericvar_to_int4(&x); - free_var(&x); PG_RETURN_INT32(result); } @@ -2344,16 +2286,13 @@ numeric_int8(PG_FUNCTION_ARGS) errmsg("cannot convert NaN to bigint"))); /* Convert to variable format and thence to int8 */ - init_var(&x); - set_var_from_num(num, &x); + init_var_from_num(num, &x); if (!numericvar_to_int8(&x, &result)) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("bigint out of range"))); - free_var(&x); - PG_RETURN_INT64(result); } @@ -2392,16 +2331,13 @@ numeric_int2(PG_FUNCTION_ARGS) errmsg("cannot convert NaN to smallint"))); /* Convert to variable format and thence to int8 */ - init_var(&x); - set_var_from_num(num, &x); + init_var_from_num(num, &x); if (!numericvar_to_int8(&x, &val)) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("smallint out of range"))); - free_var(&x); - /* Down-convert to int2 */ result = (int16) val; @@ -2763,8 +2699,7 @@ numeric_stddev_internal(ArrayType *transarray, if (NUMERIC_IS_NAN(N) || NUMERIC_IS_NAN(sumX) || NUMERIC_IS_NAN(sumX2)) return make_result(&const_nan); - init_var(&vN); - set_var_from_num(N, &vN); + init_var_from_num(N, &vN); /* * Sample stddev and variance are undefined when N <= 1; population stddev @@ -2777,7 +2712,6 @@ numeric_stddev_internal(ArrayType *transarray, if (cmp_var(&vN, comp) <= 0) { - free_var(&vN); *is_null = true; return NULL; } @@ -2785,10 +2719,8 @@ 
numeric_stddev_internal(ArrayType *transarray, init_var(&vNminus1); sub_var(&vN, &const_one, &vNminus1); - init_var(&vsumX); - set_var_from_num(sumX, &vsumX); - init_var(&vsumX2); - set_var_from_num(sumX2, &vsumX2); + init_var_from_num(sumX, &vsumX); + init_var_from_num(sumX2, &vsumX2); /* compute rscale for mul_var calls */ rscale = vsumX.dscale * 2; @@ -2816,7 +2748,6 @@ numeric_stddev_internal(ArrayType *transarray, res = make_result(&vsumX); } - free_var(&vN); free_var(&vNminus1); free_var(&vsumX); free_var(&vsumX2); @@ -3449,6 +3380,32 @@ set_var_from_num(Numeric num, NumericVar *dest) } +/* + * init_var_from_num() - + * + * Initialize a variable from packed db format. The digits array is not + * copied, which saves some cycles when the resulting var is not modified. + * Also, there's no need to call free_var(), as long as you don't assign any + * other value to it (with set_var_* functions, or by using the var as the + * destination of a function like add_var()) + * + * CAUTION: Do not modify the digits buffer of a var initialized with this + * function, e.g by calling round_var() or trunc_var(), as the changes will + * propagate to the original Numeric! It's OK to use it as the destination + * argument of one of the calculational functions, though. + */ +static void +init_var_from_num(Numeric num, NumericVar *dest) +{ + dest->ndigits = NUMERIC_NDIGITS(num); + dest->weight = NUMERIC_WEIGHT(num); + dest->sign = NUMERIC_SIGN(num); + dest->dscale = NUMERIC_DSCALE(num); + dest->digits = NUMERIC_DIGITS(num); + dest->buf = NULL; /* digits array is not palloc'd */ +} + + /* * set_var_from_var() - * @@ -3475,12 +3432,13 @@ set_var_from_var(NumericVar *value, NumericVar *dest) * get_str_from_var() - * * Convert a var to text representation (guts of numeric_out). - * CAUTION: var's contents may be modified by rounding! + * The var is displayed to the number of digits indicated by its dscale. * Returns a palloc'd string. */ static char * -get_str_from_var(NumericVar *var, int dscale) +get_str_from_var(NumericVar *var) { + int dscale; char *str; char *cp; char *endcp; @@ -3492,13 +3450,7 @@ get_str_from_var(NumericVar *var, int dscale) NumericDigit d1; #endif - if (dscale < 0) - dscale = 0; - - /* - * Check if we must round up before printing the value and do so. - */ - round_var(var, dscale); + dscale = var->dscale; /* * Allocate space for the result. @@ -3634,8 +3586,6 @@ get_str_from_var(NumericVar *var, int dscale) * rscale is the number of decimal digits desired after the decimal point in * the output, negative values will be treated as meaning zero. * - * CAUTION: var's contents may be modified by rounding! - * * Returns a palloc'd string. */ static char * @@ -3694,10 +3644,9 @@ get_str_from_var_sci(NumericVar *var, int rscale) init_var(&denominator); init_var(&significand); - int8_to_numericvar((int64) 10, &denominator); - power_var_int(&denominator, exponent, &denominator, denom_scale); + power_var_int(&const_ten, exponent, &denominator, denom_scale); div_var(var, &denominator, &significand, rscale, true); - sig_out = get_str_from_var(&significand, rscale); + sig_out = get_str_from_var(&significand); free_var(&denominator); free_var(&significand); @@ -3886,8 +3835,6 @@ apply_typmod(NumericVar *var, int32 typmod) * Convert numeric to int8, rounding if needed. * * If overflow, return FALSE (no error is raised). Return TRUE if okay. - * - * CAUTION: var's contents may be modified by rounding! 
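The CAUTION on init_var_from_num() above is the crux of this refactoring: the var borrows the packed datum's digit array instead of copying it, so modifying the digits in place (round_var(), trunc_var()) would rewrite the stored value. A stand-alone sketch of the hazard, using invented stand-in types rather than the real Numeric/NumericVar:

#include <stdio.h>

typedef struct { short digits[4]; int ndigits; } PackedValue;            /* stand-in for Numeric */
typedef struct { short *digits; int ndigits; short *buf; } UnpackedVar;  /* stand-in for NumericVar */

/* Point the var at the packed digits instead of copying them; there is
 * nothing for the caller to free afterwards. */
static void init_from_packed(PackedValue *p, UnpackedVar *v)
{
    v->ndigits = p->ndigits;
    v->digits = p->digits;
    v->buf = NULL;
}

int main(void)
{
    PackedValue num = {{1, 2, 3, 4}, 4};
    UnpackedVar v;

    init_from_packed(&num, &v);

    v.digits[0] = 9;                  /* DON'T: this scribbles on the packed value too */
    printf("%d\n", num.digits[0]);    /* prints 9, not 1 */
    return 0;
}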
*/ static bool numericvar_to_int8(NumericVar *var, int64 *result) @@ -3899,16 +3846,20 @@ numericvar_to_int8(NumericVar *var, int64 *result) int64 val, oldval; bool neg; + NumericVar rounded; /* Round to nearest integer */ - round_var(var, 0); + init_var(&rounded); + set_var_from_var(var, &rounded); + round_var(&rounded, 0); /* Check for zero input */ - strip_var(var); - ndigits = var->ndigits; + strip_var(&rounded); + ndigits = rounded.ndigits; if (ndigits == 0) { *result = 0; + free_var(&rounded); return true; } @@ -3916,12 +3867,12 @@ numericvar_to_int8(NumericVar *var, int64 *result) * For input like 10000000000, we must treat stripped digits as real. So * the loop assumes there are weight+1 digits before the decimal point. */ - weight = var->weight; + weight = rounded.weight; Assert(weight >= 0 && ndigits <= weight + 1); /* Construct the result */ - digits = var->digits; - neg = (var->sign == NUMERIC_NEG); + digits = rounded.digits; + neg = (rounded.sign == NUMERIC_NEG); val = digits[0]; for (i = 1; i <= weight; i++) { @@ -3940,10 +3891,15 @@ numericvar_to_int8(NumericVar *var, int64 *result) if ((val / NBASE) != oldval) /* possible overflow? */ { if (!neg || (-val) != val || val == 0 || oldval < 0) + { + free_var(&rounded); return false; + } } } + free_var(&rounded); + *result = neg ? -val : val; return true; } @@ -4030,7 +3986,7 @@ numericvar_to_double_no_overflow(NumericVar *var) double val; char *endptr; - tmp = get_str_from_var(var, var->dscale); + tmp = get_str_from_var(var); /* unlike float8in, we ignore ERANGE from strtod */ val = strtod(tmp, &endptr); @@ -5597,13 +5553,9 @@ power_var(NumericVar *base, NumericVar *exp, NumericVar *result) if (exp->ndigits == 0 || exp->ndigits <= exp->weight + 1) { /* exact integer, but does it fit in int? */ - NumericVar x; int64 expval64; - /* must copy because numericvar_to_int8() scribbles on input */ - init_var(&x); - set_var_from_var(exp, &x); - if (numericvar_to_int8(&x, &expval64)) + if (numericvar_to_int8(exp, &expval64)) { int expval = (int) expval64; @@ -5617,12 +5569,9 @@ power_var(NumericVar *base, NumericVar *exp, NumericVar *result) rscale = Min(rscale, NUMERIC_MAX_DISPLAY_SCALE); power_var_int(base, expval, result, rscale); - - free_var(&x); return; } } - free_var(&x); } /* diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index 13e574d4e8..d4ed7d0ca0 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -32,6 +32,7 @@ typedef struct ColumnIOData Oid column_type; Oid typiofunc; Oid typioparam; + bool typisvarlena; FmgrInfo proc; } ColumnIOData; @@ -364,6 +365,7 @@ record_out(PG_FUNCTION_ARGS) { ColumnIOData *column_info = &my_extra->columns[i]; Oid column_type = tupdesc->attrs[i]->atttypid; + Datum attr; char *value; char *tmp; bool nq; @@ -387,17 +389,24 @@ record_out(PG_FUNCTION_ARGS) */ if (column_info->column_type != column_type) { - bool typIsVarlena; - getTypeOutputInfo(column_type, &column_info->typiofunc, - &typIsVarlena); + &column_info->typisvarlena); fmgr_info_cxt(column_info->typiofunc, &column_info->proc, fcinfo->flinfo->fn_mcxt); column_info->column_type = column_type; } - value = OutputFunctionCall(&column_info->proc, values[i]); + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the type's output routine. 
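The detoasting that record_out() (and record_send(), further down) now performs follows a common idiom: the expansion step may return either the original datum or a freshly allocated flat copy, so the caller frees only when the pointers differ. A minimal sketch of that idiom with an invented expansion helper, error handling omitted:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns its argument if the value is already "flat", otherwise a malloc'd
 * expansion; a leading '!' stands in for a toast pointer here. */
static char *maybe_expand(char *value)
{
    if (value[0] != '!')
        return value;
    char *copy = malloc(strlen(value));
    strcpy(copy, value + 1);
    return copy;
}

int main(void)
{
    char raw[] = "!hello";
    char *flat = maybe_expand(raw);

    printf("%s\n", flat);             /* use the flat form */

    if (flat != raw)                  /* free only if a copy was actually made */
        free(flat);
    return 0;
}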
+ */ + if (column_info->typisvarlena) + attr = PointerGetDatum(PG_DETOAST_DATUM(values[i])); + else + attr = values[i]; + + value = OutputFunctionCall(&column_info->proc, attr); /* Detect whether we need double quotes for this value */ nq = (value[0] == '\0'); /* force quotes for empty string */ @@ -416,17 +425,23 @@ record_out(PG_FUNCTION_ARGS) /* And emit the string */ if (nq) - appendStringInfoChar(&buf, '"'); + appendStringInfoCharMacro(&buf, '"'); for (tmp = value; *tmp; tmp++) { char ch = *tmp; if (ch == '"' || ch == '\\') - appendStringInfoChar(&buf, ch); - appendStringInfoChar(&buf, ch); + appendStringInfoCharMacro(&buf, ch); + appendStringInfoCharMacro(&buf, ch); } if (nq) - appendStringInfoChar(&buf, '"'); + appendStringInfoCharMacro(&buf, '"'); + + pfree(value); + + /* Clean up detoasted copy, if any */ + if (DatumGetPointer(attr) != DatumGetPointer(values[i])) + pfree(DatumGetPointer(attr)); } appendStringInfoChar(&buf, ')'); @@ -714,6 +729,7 @@ record_send(PG_FUNCTION_ARGS) { ColumnIOData *column_info = &my_extra->columns[i]; Oid column_type = tupdesc->attrs[i]->atttypid; + Datum attr; bytea *outputbytes; /* Ignore dropped columns in datatype */ @@ -734,23 +750,35 @@ record_send(PG_FUNCTION_ARGS) */ if (column_info->column_type != column_type) { - bool typIsVarlena; - getTypeBinaryOutputInfo(column_type, &column_info->typiofunc, - &typIsVarlena); + &column_info->typisvarlena); fmgr_info_cxt(column_info->typiofunc, &column_info->proc, fcinfo->flinfo->fn_mcxt); column_info->column_type = column_type; } - outputbytes = SendFunctionCall(&column_info->proc, values[i]); + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the type's output routine. + */ + if (column_info->typisvarlena) + attr = PointerGetDatum(PG_DETOAST_DATUM(values[i])); + else + attr = values[i]; + + outputbytes = SendFunctionCall(&column_info->proc, attr); /* We assume the result will not have been toasted */ pq_sendint(&buf, VARSIZE(outputbytes) - VARHDRSZ, 4); pq_sendbytes(&buf, VARDATA(outputbytes), VARSIZE(outputbytes) - VARHDRSZ); + pfree(outputbytes); + + /* Clean up detoasted copy, if any */ + if (DatumGetPointer(attr) != DatumGetPointer(values[i])) + pfree(DatumGetPointer(attr)); } pfree(values); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index c3ede233bc..b7aff1189b 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -3690,8 +3690,8 @@ get_insert_query_def(Query *query, deparse_context *context) get_with_clause(query, context); /* - * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be - * a single RTE for the SELECT or VALUES. + * If it's an INSERT ... SELECT or multi-row VALUES, there will be a + * single RTE for the SELECT or VALUES. Plain VALUES has neither. 
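The ruleutils changes that follow make both the column list and the VALUES clause conditional on a non-empty target list, so a rule or view over INSERT INTO t DEFAULT VALUES deparses back to DEFAULT VALUES instead of an ill-formed () VALUES (). A toy restatement of the fixed branching (invented helper, not the real deparser):

#include <stdio.h>

static void deparse_insert(const char *table, const char **cols, int ncols)
{
    printf("INSERT INTO %s ", table);

    if (ncols > 0)                      /* only emit the parens when there are columns */
    {
        printf("(");
        for (int i = 0; i < ncols; i++)
            printf("%s%s", i ? ", " : "", cols[i]);
        printf(") VALUES (...)");
    }
    else
        printf("DEFAULT VALUES");       /* no target list means DEFAULT VALUES */
    printf("\n");
}

int main(void)
{
    const char *cols[] = {"a", "b"};

    deparse_insert("t", cols, 2);       /* INSERT INTO t (a, b) VALUES (...) */
    deparse_insert("t", NULL, 0);       /* INSERT INTO t DEFAULT VALUES */
    return 0;
}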
*/ foreach(l, query->rtable) { @@ -3725,7 +3725,7 @@ get_insert_query_def(Query *query, deparse_context *context) context->indentLevel += PRETTYINDENT_STD; appendStringInfoChar(buf, ' '); } - appendStringInfo(buf, "INSERT INTO %s (", + appendStringInfo(buf, "INSERT INTO %s ", generate_relation_name(rte->relid, NIL)); /* @@ -3742,6 +3742,8 @@ get_insert_query_def(Query *query, deparse_context *context) values_cell = NULL; strippedexprs = NIL; sep = ""; + if (query->targetList) + appendStringInfoChar(buf, '('); foreach(l, query->targetList) { TargetEntry *tle = (TargetEntry *) lfirst(l); @@ -3778,7 +3780,8 @@ get_insert_query_def(Query *query, deparse_context *context) context, true)); } } - appendStringInfo(buf, ") "); + if (query->targetList) + appendStringInfo(buf, ") "); if (select_rte) { @@ -3791,7 +3794,7 @@ get_insert_query_def(Query *query, deparse_context *context) /* Add the multi-VALUES expression lists */ get_values_def(values_rte->values_lists, context); } - else + else if (strippedexprs) { /* Add the single-VALUES expression list */ appendContextKeyword(context, "VALUES (", @@ -3799,6 +3802,11 @@ get_insert_query_def(Query *query, deparse_context *context) get_rule_expr((Node *) strippedexprs, context, false); appendStringInfoChar(buf, ')'); } + else + { + /* No expressions, so it must be DEFAULT VALUES */ + appendStringInfo(buf, "DEFAULT VALUES"); + } /* Add RETURNING if present */ if (query->returningList) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 61100aec4a..60000aaf34 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6130,12 +6130,14 @@ genericcostestimate(PlannerInfo *root, * index would have better selectivity.) * * We can deal with this by adding a very small "fudge factor" that - * depends on the index size. The fudge factor used here is one - * spc_random_page_cost per 10000 index pages, which should be small - * enough to not alter index-vs-seqscan decisions, but will prevent - * indexes of different sizes from looking exactly equally attractive. + * depends on the index size, so that indexes of different sizes won't + * look exactly equally attractive. To ensure the fudge factor stays + * small even for very large indexes, use a log function. (We previously + * used a factor of one spc_random_page_cost per 10000 index pages, which + * grew too large for large indexes. This expression has about the same + * growth rate for small indexes, but tails off quickly.) */ - *indexTotalCost += index->pages * spc_random_page_cost / 10000.0; + *indexTotalCost += log(1.0 + index->pages / 10000.0) * spc_random_page_cost; /* * CPU cost: any complex expressions in the indexquals will need to be diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 50ef8976be..6ff7385233 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -1285,6 +1285,50 @@ GetCurrentTimestamp(void) return result; } +/* + * GetCurrentIntegerTimestamp -- get the current operating system time as int64 + * + * Result is the number of milliseconds since the Postgres epoch. If compiled + * with --enable-integer-datetimes, this is identical to GetCurrentTimestamp(), + * and is implemented as a macro. 
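The genericcostestimate() change above swaps a linear fudge factor for a logarithmic one. A throwaway comparison program (illustrative only; link with -lm) shows the intended behavior: nearly identical for small indexes, but the new term stays small even for huge ones (values are in units of spc_random_page_cost):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double pages[] = {100, 10000, 1000000, 100000000};

    for (int i = 0; i < 4; i++)
    {
        double old_fudge = pages[i] / 10000.0;               /* linear */
        double new_fudge = log(1.0 + pages[i] / 10000.0);    /* logarithmic */

        printf("%12.0f pages: old %10.4f  new %8.4f\n",
               pages[i], old_fudge, new_fudge);
    }
    /* e.g. 10^6 pages: old ~100.0, new ~4.6; 10^8 pages: old ~10000, new ~9.2 */
    return 0;
}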
+ */ +#ifndef HAVE_INT64_TIMESTAMP +int64 +GetCurrentIntegerTimestamp(void) +{ + int64 result; + struct timeval tp; + + gettimeofday(&tp, NULL); + + result = (int64) tp.tv_sec - + ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY); + + result = (result * USECS_PER_SEC) + tp.tv_usec; + + return result; +} +#endif + +/* + * IntegetTimestampToTimestampTz -- convert an int64 timestamp to native format + * + * When compiled with --enable-integer-datetimes, this is implemented as a + * no-op macro. + */ +#ifndef HAVE_INT64_TIMESTAMP +TimestampTz +IntegerTimestampToTimestampTz(int64 timestamp) +{ + TimestampTz result; + + result = timestamp / USECS_PER_SEC; + result += (timestamp % USECS_PER_SEC) / 1000000.0; + + return result; +} +#endif + /* * TimestampDifference -- convert the difference between two timestamps * into integer seconds and microseconds diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index d6f6b1c0de..9ae143272b 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -291,7 +291,7 @@ CatalogCacheComputeTupleHashValue(CatCache *cache, HeapTuple tuple) static void CatCachePrintStats(int code, Datum arg) { - CatCache *cache; + slist_iter iter; long cc_searches = 0; long cc_hits = 0; long cc_neg_hits = 0; @@ -300,8 +300,10 @@ CatCachePrintStats(int code, Datum arg) long cc_lsearches = 0; long cc_lhits = 0; - for (cache = CacheHdr->ch_caches; cache; cache = cache->cc_next) + slist_foreach(iter, &CacheHdr->ch_caches) { + CatCache *cache = slist_container(CatCache, cc_next, iter.cur); + if (cache->cc_ntup == 0 && cache->cc_searches == 0) continue; /* don't print unused caches */ elog(DEBUG2, "catcache %s/%u: %d tup, %ld srch, %ld+%ld=%ld hits, %ld+%ld=%ld loads, %ld invals, %ld lsrch, %ld lhits", @@ -369,7 +371,7 @@ CatCacheRemoveCTup(CatCache *cache, CatCTup *ct) } /* delink from linked list */ - DLRemove(&ct->cache_elem); + dlist_delete(&ct->cache_elem); /* free associated tuple data */ if (ct->tuple.t_data != NULL) @@ -412,7 +414,7 @@ CatCacheRemoveCList(CatCache *cache, CatCList *cl) } /* delink from linked list */ - DLRemove(&cl->cache_elem); + dlist_delete(&cl->cache_elem); /* free associated tuple data */ if (cl->tuple.t_data != NULL) @@ -442,18 +444,18 @@ CatCacheRemoveCList(CatCache *cache, CatCList *cl) void CatalogCacheIdInvalidate(int cacheId, uint32 hashValue) { - CatCache *ccp; + slist_iter cache_iter; CACHE1_elog(DEBUG2, "CatalogCacheIdInvalidate: called"); /* * inspect caches to find the proper cache */ - for (ccp = CacheHdr->ch_caches; ccp; ccp = ccp->cc_next) + slist_foreach(cache_iter, &CacheHdr->ch_caches) { + CatCache *ccp = slist_container(CatCache, cc_next, cache_iter.cur); Index hashIndex; - Dlelem *elt, - *nextelt; + dlist_mutable_iter iter; if (cacheId != ccp->id) continue; @@ -468,11 +470,9 @@ CatalogCacheIdInvalidate(int cacheId, uint32 hashValue) * Invalidate *all* CatCLists in this cache; it's too hard to tell * which searches might still be correct, so just zap 'em all. 
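GetCurrentIntegerTimestamp() above rebases the gettimeofday() reading onto the PostgreSQL epoch (2000-01-01) and scales it; note that the arithmetic yields microseconds since that epoch, not milliseconds. A stand-alone restatement with the constants written out (10957 days separate the Unix and Postgres epochs):

#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

int main(void)
{
    struct timeval tp;

    gettimeofday(&tp, NULL);

    /* shift from the Unix epoch (1970-01-01) to 2000-01-01, then scale */
    int64_t secs = (int64_t) tp.tv_sec - (int64_t) 10957 * 86400;
    int64_t usecs = secs * 1000000 + tp.tv_usec;

    printf("%lld microseconds since 2000-01-01\n", (long long) usecs);
    return 0;
}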
*/ - for (elt = DLGetHead(&ccp->cc_lists); elt; elt = nextelt) + dlist_foreach_modify(iter, &ccp->cc_lists) { - CatCList *cl = (CatCList *) DLE_VAL(elt); - - nextelt = DLGetSucc(elt); + CatCList *cl = dlist_container(CatCList, cache_elem, iter.cur); if (cl->refcount > 0) cl->dead = true; @@ -484,12 +484,9 @@ CatalogCacheIdInvalidate(int cacheId, uint32 hashValue) * inspect the proper hash bucket for tuple matches */ hashIndex = HASH_INDEX(hashValue, ccp->cc_nbuckets); - - for (elt = DLGetHead(&ccp->cc_bucket[hashIndex]); elt; elt = nextelt) + dlist_foreach_modify(iter, &ccp->cc_bucket[hashIndex]) { - CatCTup *ct = (CatCTup *) DLE_VAL(elt); - - nextelt = DLGetSucc(elt); + CatCTup *ct = dlist_container(CatCTup, cache_elem, iter.cur); if (hashValue == ct->hash_value) { @@ -557,17 +554,18 @@ AtEOXact_CatCache(bool isCommit) #ifdef USE_ASSERT_CHECKING if (assert_enabled) { - CatCache *ccp; + slist_iter cache_iter; - for (ccp = CacheHdr->ch_caches; ccp; ccp = ccp->cc_next) + slist_foreach(cache_iter, &CacheHdr->ch_caches) { - Dlelem *elt; + CatCache *ccp = slist_container(CatCache, cc_next, cache_iter.cur); + dlist_iter iter; int i; /* Check CatCLists */ - for (elt = DLGetHead(&ccp->cc_lists); elt; elt = DLGetSucc(elt)) + dlist_foreach(iter, &ccp->cc_lists) { - CatCList *cl = (CatCList *) DLE_VAL(elt); + CatCList *cl = dlist_container(CatCList, cache_elem, iter.cur); Assert(cl->cl_magic == CL_MAGIC); Assert(cl->refcount == 0); @@ -577,11 +575,11 @@ AtEOXact_CatCache(bool isCommit) /* Check individual tuples */ for (i = 0; i < ccp->cc_nbuckets; i++) { - for (elt = DLGetHead(&ccp->cc_bucket[i]); - elt; - elt = DLGetSucc(elt)) + dlist_head *bucket = &ccp->cc_bucket[i]; + + dlist_foreach(iter, bucket) { - CatCTup *ct = (CatCTup *) DLE_VAL(elt); + CatCTup *ct = dlist_container(CatCTup, cache_elem, iter.cur); Assert(ct->ct_magic == CT_MAGIC); Assert(ct->refcount == 0); @@ -604,16 +602,13 @@ AtEOXact_CatCache(bool isCommit) static void ResetCatalogCache(CatCache *cache) { - Dlelem *elt, - *nextelt; + dlist_mutable_iter iter; int i; /* Remove each list in this cache, or at least mark it dead */ - for (elt = DLGetHead(&cache->cc_lists); elt; elt = nextelt) + dlist_foreach_modify(iter, &cache->cc_lists) { - CatCList *cl = (CatCList *) DLE_VAL(elt); - - nextelt = DLGetSucc(elt); + CatCList *cl = dlist_container(CatCList, cache_elem, iter.cur); if (cl->refcount > 0) cl->dead = true; @@ -624,11 +619,11 @@ ResetCatalogCache(CatCache *cache) /* Remove each tuple in this cache, or at least mark it dead */ for (i = 0; i < cache->cc_nbuckets; i++) { - for (elt = DLGetHead(&cache->cc_bucket[i]); elt; elt = nextelt) - { - CatCTup *ct = (CatCTup *) DLE_VAL(elt); + dlist_head *bucket = &cache->cc_bucket[i]; - nextelt = DLGetSucc(elt); + dlist_foreach_modify(iter, bucket) + { + CatCTup *ct = dlist_container(CatCTup, cache_elem, iter.cur); if (ct->refcount > 0 || (ct->c_list && ct->c_list->refcount > 0)) @@ -654,12 +649,16 @@ ResetCatalogCache(CatCache *cache) void ResetCatalogCaches(void) { - CatCache *cache; + slist_iter iter; CACHE1_elog(DEBUG2, "ResetCatalogCaches called"); - for (cache = CacheHdr->ch_caches; cache; cache = cache->cc_next) + slist_foreach(iter, &CacheHdr->ch_caches) + { + CatCache *cache = slist_container(CatCache, cc_next, iter.cur); + ResetCatalogCache(cache); + } CACHE1_elog(DEBUG2, "end of ResetCatalogCaches call"); } @@ -680,12 +679,14 @@ ResetCatalogCaches(void) void CatalogCacheFlushCatalog(Oid catId) { - CatCache *cache; + slist_iter iter; CACHE2_elog(DEBUG2, "CatalogCacheFlushCatalog 
called for %u", catId); - for (cache = CacheHdr->ch_caches; cache; cache = cache->cc_next) + slist_foreach(iter, &CacheHdr->ch_caches) { + CatCache *cache = slist_container(CatCache, cc_next, iter.cur); + /* Does this cache store tuples of the target catalog? */ if (cache->cc_reloid == catId) { @@ -760,7 +761,7 @@ InitCatCache(int id, if (CacheHdr == NULL) { CacheHdr = (CatCacheHeader *) palloc(sizeof(CatCacheHeader)); - CacheHdr->ch_caches = NULL; + slist_init(&CacheHdr->ch_caches); CacheHdr->ch_ntup = 0; #ifdef CATCACHE_STATS /* set up to dump stats at backend exit */ @@ -771,9 +772,9 @@ InitCatCache(int id, /* * allocate a new cache structure * - * Note: we assume zeroing initializes the Dllist headers correctly + * Note: we rely on zeroing to initialize all the dlist headers correctly */ - cp = (CatCache *) palloc0(sizeof(CatCache) + nbuckets * sizeof(Dllist)); + cp = (CatCache *) palloc0(sizeof(CatCache) + nbuckets * sizeof(dlist_head)); /* * initialize the cache's relation information for the relation @@ -801,8 +802,7 @@ InitCatCache(int id, /* * add completed cache to top of group header's list */ - cp->cc_next = CacheHdr->ch_caches; - CacheHdr->ch_caches = cp; + slist_push_head(&CacheHdr->ch_caches, &cp->cc_next); /* * back to the old context before we return... @@ -1060,7 +1060,8 @@ SearchCatCache(CatCache *cache, ScanKeyData cur_skey[CATCACHE_MAXKEYS]; uint32 hashValue; Index hashIndex; - Dlelem *elt; + dlist_iter iter; + dlist_head *bucket; CatCTup *ct; Relation relation; SysScanDesc scandesc; @@ -1093,14 +1094,16 @@ SearchCatCache(CatCache *cache, /* * scan the hash bucket until we find a match or exhaust our tuples + * + * Note: it's okay to use dlist_foreach here, even though we modify the + * dlist within the loop, because we don't continue the loop afterwards. */ - for (elt = DLGetHead(&cache->cc_bucket[hashIndex]); - elt; - elt = DLGetSucc(elt)) + bucket = &cache->cc_bucket[hashIndex]; + dlist_foreach(iter, bucket) { bool res; - ct = (CatCTup *) DLE_VAL(elt); + ct = dlist_container(CatCTup, cache_elem, iter.cur); if (ct->dead) continue; /* ignore dead entries */ @@ -1125,7 +1128,7 @@ SearchCatCache(CatCache *cache, * most frequently accessed elements in any hashbucket will tend to be * near the front of the hashbucket's list.) */ - DLMoveToFront(&ct->cache_elem); + dlist_move_head(bucket, &ct->cache_elem); /* * If it's a positive entry, bump its refcount and return it. If it's @@ -1340,7 +1343,7 @@ SearchCatCacheList(CatCache *cache, { ScanKeyData cur_skey[CATCACHE_MAXKEYS]; uint32 lHashValue; - Dlelem *elt; + dlist_iter iter; CatCList *cl; CatCTup *ct; List *volatile ctlist; @@ -1381,14 +1384,15 @@ SearchCatCacheList(CatCache *cache, /* * scan the items until we find a match or exhaust our list + * + * Note: it's okay to use dlist_foreach here, even though we modify the + * dlist within the loop, because we don't continue the loop afterwards. */ - for (elt = DLGetHead(&cache->cc_lists); - elt; - elt = DLGetSucc(elt)) + dlist_foreach(iter, &cache->cc_lists) { bool res; - cl = (CatCList *) DLE_VAL(elt); + cl = dlist_container(CatCList, cache_elem, iter.cur); if (cl->dead) continue; /* ignore dead entries */ @@ -1416,7 +1420,7 @@ SearchCatCacheList(CatCache *cache, * since there's no point in that unless they are searched for * individually.) 
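The catcache changes above replace the old Dllist cells with dlist_node links embedded in each entry, so a cache hit can be promoted to the head of its bucket (or of cc_lists) with pure pointer surgery and no allocation. A toy, self-contained version of that intrusive move-to-front pattern (not the backend's lib/ilist.h):

#include <stddef.h>
#include <stdio.h>

typedef struct node { struct node *prev, *next; } node;
typedef struct { node head; } list;             /* circular list with a sentinel head */

#define containerof(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))

typedef struct { int key; node elem; } entry;   /* the link lives inside the entry */

static void list_init(list *l) { l->head.prev = l->head.next = &l->head; }
static void list_delete(node *n) { n->prev->next = n->next; n->next->prev = n->prev; }
static void list_push_head(list *l, node *n)
{
    n->next = l->head.next; n->prev = &l->head;
    n->next->prev = n; l->head.next = n;
}
static void list_move_head(list *l, node *n) { list_delete(n); list_push_head(l, n); }

int main(void)
{
    list bucket;
    entry a = {1}, b = {2}, c = {3};

    list_init(&bucket);
    list_push_head(&bucket, &a.elem);
    list_push_head(&bucket, &b.elem);
    list_push_head(&bucket, &c.elem);           /* order is now 3 2 1 */

    list_move_head(&bucket, &a.elem);           /* entry 1 was just searched for */

    for (node *n = bucket.head.next; n != &bucket.head; n = n->next)
        printf("%d ", containerof(n, entry, elem)->key);    /* prints 1 3 2 */
    printf("\n");
    return 0;
}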
*/ - DLMoveToFront(&cl->cache_elem); + dlist_move_head(&cache->cc_lists, &cl->cache_elem); /* Bump the list's refcount and return it */ ResourceOwnerEnlargeCatCacheListRefs(CurrentResourceOwner); @@ -1468,6 +1472,8 @@ SearchCatCacheList(CatCache *cache, { uint32 hashValue; Index hashIndex; + bool found = false; + dlist_head *bucket; /* * See if there's an entry for this tuple already. @@ -1476,11 +1482,10 @@ SearchCatCacheList(CatCache *cache, hashValue = CatalogCacheComputeTupleHashValue(cache, ntp); hashIndex = HASH_INDEX(hashValue, cache->cc_nbuckets); - for (elt = DLGetHead(&cache->cc_bucket[hashIndex]); - elt; - elt = DLGetSucc(elt)) + bucket = &cache->cc_bucket[hashIndex]; + dlist_foreach(iter, bucket) { - ct = (CatCTup *) DLE_VAL(elt); + ct = dlist_container(CatCTup, cache_elem, iter.cur); if (ct->dead || ct->negative) continue; /* ignore dead and negative entries */ @@ -1498,10 +1503,11 @@ SearchCatCacheList(CatCache *cache, if (ct->c_list) continue; + found = true; break; /* A-OK */ } - if (elt == NULL) + if (!found) { /* We didn't find a usable entry, so make a new one */ ct = CatalogCacheCreateEntry(cache, ntp, @@ -1564,7 +1570,6 @@ SearchCatCacheList(CatCache *cache, cl->cl_magic = CL_MAGIC; cl->my_cache = cache; - DLInitElem(&cl->cache_elem, cl); cl->refcount = 0; /* for the moment */ cl->dead = false; cl->ordered = ordered; @@ -1587,7 +1592,7 @@ SearchCatCacheList(CatCache *cache, } Assert(i == nmembers); - DLAddHead(&cache->cc_lists, &cl->cache_elem); + dlist_push_head(&cache->cc_lists, &cl->cache_elem); /* Finally, bump the list's refcount and return it */ cl->refcount++; @@ -1664,14 +1669,13 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, */ ct->ct_magic = CT_MAGIC; ct->my_cache = cache; - DLInitElem(&ct->cache_elem, (void *) ct); ct->c_list = NULL; ct->refcount = 0; /* for the moment */ ct->dead = false; ct->negative = negative; ct->hash_value = hashValue; - DLAddHead(&cache->cc_bucket[hashIndex], &ct->cache_elem); + dlist_push_head(&cache->cc_bucket[hashIndex], &ct->cache_elem); cache->cc_ntup++; CacheHdr->ch_ntup++; @@ -1785,7 +1789,7 @@ PrepareToInvalidateCacheTuple(Relation relation, HeapTuple newtuple, void (*function) (int, uint32, Oid)) { - CatCache *ccp; + slist_iter iter; Oid reloid; CACHE1_elog(DEBUG2, "PrepareToInvalidateCacheTuple: called"); @@ -1808,8 +1812,9 @@ PrepareToInvalidateCacheTuple(Relation relation, * ---------------- */ - for (ccp = CacheHdr->ch_caches; ccp; ccp = ccp->cc_next) + slist_foreach(iter, &CacheHdr->ch_caches) { + CatCache *ccp = slist_container(CatCache, cc_next, iter.cur); uint32 hashvalue; Oid dbid; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 8c9ebe0f6f..9a504f8025 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1731,9 +1731,23 @@ RelationReloadIndexInfo(Relation relation) RelationGetRelid(relation)); index = (Form_pg_index) GETSTRUCT(tuple); + /* + * Basically, let's just copy all the bool fields. There are one or + * two of these that can't actually change in the current code, but + * it's not worth it to track exactly which ones they are. None of + * the array fields are allowed to change, though. 
+ */ + relation->rd_index->indisunique = index->indisunique; + relation->rd_index->indisprimary = index->indisprimary; + relation->rd_index->indisexclusion = index->indisexclusion; + relation->rd_index->indimmediate = index->indimmediate; + relation->rd_index->indisclustered = index->indisclustered; relation->rd_index->indisvalid = index->indisvalid; relation->rd_index->indcheckxmin = index->indcheckxmin; relation->rd_index->indisready = index->indisready; + relation->rd_index->indislive = index->indislive; + + /* Copy xmin too, as that is needed to make sense of indcheckxmin */ HeapTupleHeaderSetXmin(relation->rd_indextuple->t_data, HeapTupleHeaderGetXmin(tuple->t_data)); @@ -3299,6 +3313,10 @@ CheckConstraintFetch(Relation relation) * so that we must recompute the index list on next request. This handles * creation or deletion of an index. * + * Indexes that are marked not IndexIsLive are omitted from the returned list. + * Such indexes are expected to be dropped momentarily, and should not be + * touched at all by any caller of this function. + * * The returned list is guaranteed to be sorted in order by OID. This is * needed by the executor, since for index types that we obtain exclusive * locks on when updating the index, all backends must lock the indexes in @@ -3358,9 +3376,12 @@ RelationGetIndexList(Relation relation) bool isnull; /* - * Ignore any indexes that are currently being dropped + * Ignore any indexes that are currently being dropped. This will + * prevent them from being searched, inserted into, or considered in + * HOT-safety decisions. It's unsafe to touch such an index at all + * since its catalog entries could disappear at any instant. */ - if (!index->indisvalid && !index->indisready) + if (!IndexIsLive(index)) continue; /* Add index's OID to result list in the proper order */ @@ -3379,7 +3400,8 @@ RelationGetIndexList(Relation relation) indclass = (oidvector *) DatumGetPointer(indclassDatum); /* Check to see if it is a unique, non-partial btree index on OID */ - if (index->indnatts == 1 && + if (IndexIsValid(index) && + index->indnatts == 1 && index->indisunique && index->indimmediate && index->indkey.values[0] == ObjectIdAttributeNumber && indclass->values[0] == OID_BTREE_OPS_OID && @@ -3674,6 +3696,13 @@ RelationGetIndexAttrBitmap(Relation relation) /* * For each index, add referenced attributes to indexattrs. + * + * Note: we consider all indexes returned by RelationGetIndexList, even if + * they are not indisready or indisvalid. This is important because an + * index for which CREATE INDEX CONCURRENTLY has just started must be + * included in HOT-safety decisions (see README.HOT). If a DROP INDEX + * CONCURRENTLY is far enough along that we should ignore the index, it + * won't be returned at all by RelationGetIndexList. */ indexattrs = NULL; foreach(l, indexoidlist) diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c index 6f214957bf..d0d96ad3fd 100644 --- a/src/backend/utils/cache/relmapper.c +++ b/src/backend/utils/cache/relmapper.c @@ -588,7 +588,8 @@ load_relmap_file(bool shared) } /* Read data ... 
*/ - fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); + fd = OpenTransientFile(mapfilename, + O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(FATAL, (errcode_for_file_access(), @@ -608,7 +609,7 @@ load_relmap_file(bool shared) errmsg("could not read relation mapping file \"%s\": %m", mapfilename))); - close(fd); + CloseTransientFile(fd); /* check for correct magic number, etc */ if (map->magic != RELMAPPER_FILEMAGIC || @@ -672,12 +673,6 @@ write_relmap_file(bool shared, RelMapFile *newmap, /* * Open the target file. We prefer to do this before entering the * critical section, so that an open() failure need not force PANIC. - * - * Note: since we use BasicOpenFile, we are nominally responsible for - * ensuring the fd is closed on error. In practice, this isn't important - * because either an error happens inside the critical section, or we are - * in bootstrap or WAL replay; so an error past this point is always fatal - * anyway. */ if (shared) { @@ -692,9 +687,9 @@ write_relmap_file(bool shared, RelMapFile *newmap, realmap = &local_map; } - fd = BasicOpenFile(mapfilename, - O_WRONLY | O_CREAT | PG_BINARY, - S_IRUSR | S_IWUSR); + fd = OpenTransientFile(mapfilename, + O_WRONLY | O_CREAT | PG_BINARY, + S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -753,7 +748,7 @@ write_relmap_file(bool shared, RelMapFile *newmap, errmsg("could not fsync relation mapping file \"%s\": %m", mapfilename))); - if (close(fd)) + if (CloseTransientFile(fd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close relation mapping file \"%s\": %m", @@ -896,19 +891,3 @@ relmap_redo(XLogRecPtr lsn, XLogRecord *record) else elog(PANIC, "relmap_redo: unknown op code %u", info); } - -void -relmap_desc(StringInfo buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_RELMAP_UPDATE) - { - xl_relmap_update *xlrec = (xl_relmap_update *) rec; - - appendStringInfo(buf, "update relmap: database %u tablespace %u size %u", - xlrec->dbid, xlrec->tsid, xlrec->nbytes); - } - else - appendStringInfo(buf, "UNKNOWN"); -} diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index a40b343ebc..c22190a8b3 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -683,13 +683,13 @@ errcode_for_socket_access(void) * to the edata field because the buffer might be considerably larger than * really necessary. */ -#define EVALUATE_MESSAGE(targetfield, appendval, translateit) \ +#define EVALUATE_MESSAGE(domain, targetfield, appendval, translateit) \ { \ char *fmtbuf; \ StringInfoData buf; \ /* Internationalize the error format string */ \ if (translateit && !in_error_recursion_trouble()) \ - fmt = dgettext(edata->domain, fmt); \ + fmt = dgettext((domain), fmt); \ /* Expand %m in format string */ \ fmtbuf = expand_fmt_string(fmt, edata); \ initStringInfo(&buf); \ @@ -723,14 +723,14 @@ errcode_for_socket_access(void) * must be declared like "const char *fmt_singular, const char *fmt_plural, * unsigned long n, ...". Translation is assumed always wanted. */ -#define EVALUATE_MESSAGE_PLURAL(targetfield, appendval) \ +#define EVALUATE_MESSAGE_PLURAL(domain, targetfield, appendval) \ { \ const char *fmt; \ char *fmtbuf; \ StringInfoData buf; \ /* Internationalize the error format string */ \ if (!in_error_recursion_trouble()) \ - fmt = dngettext(edata->domain, fmt_singular, fmt_plural, n); \ + fmt = dngettext((domain), fmt_singular, fmt_plural, n); \ else \ fmt = (n == 1 ? 
fmt_singular : fmt_plural); \ /* Expand %m in format string */ \ @@ -781,7 +781,7 @@ errmsg(const char *fmt,...) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(message, false, true); + EVALUATE_MESSAGE(edata->domain, message, false, true); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -810,7 +810,7 @@ errmsg_internal(const char *fmt,...) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(message, false, false); + EVALUATE_MESSAGE(edata->domain, message, false, false); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -833,7 +833,7 @@ errmsg_plural(const char *fmt_singular, const char *fmt_plural, CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE_PLURAL(message, false); + EVALUATE_MESSAGE_PLURAL(edata->domain, message, false); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -854,7 +854,7 @@ errdetail(const char *fmt,...) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(detail, false, true); + EVALUATE_MESSAGE(edata->domain, detail, false, true); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -881,7 +881,7 @@ errdetail_internal(const char *fmt,...) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(detail, false, false); + EVALUATE_MESSAGE(edata->domain, detail, false, false); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -902,7 +902,7 @@ errdetail_log(const char *fmt,...) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(detail_log, false, true); + EVALUATE_MESSAGE(edata->domain, detail_log, false, true); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -925,7 +925,7 @@ errdetail_plural(const char *fmt_singular, const char *fmt_plural, CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE_PLURAL(detail, false); + EVALUATE_MESSAGE_PLURAL(edata->domain, detail, false); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -946,7 +946,7 @@ errhint(const char *fmt,...) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(hint, false, true); + EVALUATE_MESSAGE(edata->domain, hint, false, true); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -955,14 +955,14 @@ errhint(const char *fmt,...) /* - * errcontext --- add a context error message text to the current error + * errcontext_msg --- add a context error message text to the current error * * Unlike other cases, multiple calls are allowed to build up a stack of * context information. We assume earlier calls represent more-closely-nested * states. */ int -errcontext(const char *fmt,...) +errcontext_msg(const char *fmt,...) { ErrorData *edata = &errordata[errordata_stack_depth]; MemoryContext oldcontext; @@ -971,13 +971,35 @@ errcontext(const char *fmt,...) CHECK_STACK_DEPTH(); oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(context, true, true); + EVALUATE_MESSAGE(edata->context_domain, context, true, true); MemoryContextSwitchTo(oldcontext); recursion_depth--; return 0; /* return value does not matter */ } +/* + * set_errcontext_domain --- set message domain to be used by errcontext() + * + * errcontext_msg() can be called from a different module than the original + * ereport(), so we cannot use the message domain passed in errstart() to + * translate it. 
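To expand on the comment in progress here: the point of splitting errcontext() into set_errcontext_domain() plus errcontext_msg() is that a wrapper macro can capture the calling module's text domain before the variadic call is made. The real macro lives in elog.h, which this patch does not show; the sketch below uses invented names to demonstrate the comma-operator trick it presumably relies on:

#include <stdio.h>

static const char *current_domain;

static int set_domain(const char *d) { current_domain = d; return 0; }

static int add_context(const char *fmt, int line)
{
    /* the real code would translate fmt via dgettext(current_domain, fmt) */
    printf("[%s] ", current_domain);
    printf(fmt, line);
    printf("\n");
    return 0;
}

/* Each module defines its own TEXTDOMAIN; the wrapper macro smuggles it in
 * via the comma operator before the variadic call. */
#define TEXTDOMAIN "plpgsql"
#define add_context_here (set_domain(TEXTDOMAIN)), add_context

int main(void)
{
    add_context_here("PL/pgSQL function line %d", 42);
    return 0;
}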
Instead, each errcontext_msg() call should be preceded by + * a set_errcontext_domain() call to specify the domain. This is usually + * done transparently by the errcontext() macro. + */ +int +set_errcontext_domain(const char *domain) +{ + ErrorData *edata = &errordata[errordata_stack_depth]; + + /* we don't bother incrementing recursion_depth */ + CHECK_STACK_DEPTH(); + + edata->context_domain = domain; + + return 0; /* return value does not matter */ +} + /* * errhidestmt --- optionally suppress STATEMENT: field of log entry @@ -1201,7 +1223,7 @@ elog_finish(int elevel, const char *fmt,...) recursion_depth++; oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(message, false, false); + EVALUATE_MESSAGE(edata->domain, message, false, false); MemoryContextSwitchTo(oldcontext); recursion_depth--; @@ -1260,7 +1282,7 @@ format_elog_string(const char *fmt,...) oldcontext = MemoryContextSwitchTo(ErrorContext); - EVALUATE_MESSAGE(message, false, true); + EVALUATE_MESSAGE(edata->domain, message, false, true); MemoryContextSwitchTo(oldcontext); @@ -1970,7 +1992,7 @@ log_line_prefix(StringInfo buf, ErrorData *edata) } break; case 'c': - appendStringInfo(buf, "%lx.%x", (long) (MyStartTime), MyProcPid); + appendStringInfo(buf, "%lx.%04x", (long) (MyStartTime), MyProcPid); break; case 'p': appendStringInfo(buf, "%d", MyProcPid); @@ -2149,7 +2171,7 @@ write_csvlog(ErrorData *edata) appendStringInfoChar(&buf, ','); /* session id */ - appendStringInfo(&buf, "%lx.%x", (long) MyStartTime, MyProcPid); + appendStringInfo(&buf, "%lx.%04x", (long) MyStartTime, MyProcPid); appendStringInfoChar(&buf, ','); /* Line number */ diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 6d0e0f5366..31ac2b25e8 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -21,6 +21,11 @@ * lookup key's hash value as a partition number --- this will work because * of the way calc_bucket() maps hash values to bucket numbers. * + * For hash tables in shared memory, the memory allocator function should + * match malloc's semantics of returning NULL on failure. For hash tables + * in local memory, we typically use palloc() which will throw error on + * failure. The code in this file has to cope with both cases. + * * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -820,6 +825,27 @@ hash_search_with_hash_value(HTAB *hashp, hctl->accesses++; #endif + /* + * If inserting, check if it is time to split a bucket. + * + * NOTE: failure to expand table is not a fatal error, it just means we + * have to run at higher fill factor than we wanted. However, if we're + * using the palloc allocator then it will throw error anyway on + * out-of-memory, so we must do this before modifying the table. + */ + if (action == HASH_ENTER || action == HASH_ENTER_NULL) + { + /* + * Can't split if running in partitioned mode, nor if frozen, nor if + * table is the subject of any active hash_seq_search scans. Strange + * order of these tests is to try to check cheaper conditions first. 
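The dynahash comment just above, together with the relocated check that follows, boils down to an ordering rule: any step that can fail, such as a palloc-backed expand_table() call, must run before the new entry becomes visible, so an out-of-memory error cannot leave a half-initialized entry in the table. A generic sketch of the same ordering using plain malloc/realloc:

#include <stdio.h>
#include <stdlib.h>

typedef struct { int *items; int nitems; int capacity; } table;

static int insert(table *t, int value)
{
    /* Step 1: do the fallible work first; on failure the table is untouched. */
    if (t->nitems == t->capacity)
    {
        int newcap = t->capacity ? t->capacity * 2 : 4;
        int *grown = realloc(t->items, newcap * sizeof(int));

        if (grown == NULL)
            return -1;
        t->items = grown;
        t->capacity = newcap;
    }

    /* Step 2: publish the entry; nothing below this point can fail. */
    t->items[t->nitems++] = value;
    return 0;
}

int main(void)
{
    table t = {NULL, 0, 0};

    for (int i = 0; i < 10; i++)
        insert(&t, i);
    printf("%d items, capacity %d\n", t.nitems, t.capacity);
    free(t.items);
    return 0;
}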
+ */ + if (!IS_PARTITIONED(hctl) && !hashp->frozen && + hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && + !has_seq_scans(hashp)) + (void) expand_table(hashp); + } + /* * Do the initial lookup */ @@ -940,24 +966,12 @@ hash_search_with_hash_value(HTAB *hashp, currBucket->hashvalue = hashvalue; hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); - /* caller is expected to fill the data field on return */ - /* - * Check if it is time to split a bucket. Can't split if running - * in partitioned mode, nor if table is the subject of any active - * hash_seq_search scans. Strange order of these tests is to try - * to check cheaper conditions first. + * Caller is expected to fill the data field on return. DO NOT + * insert any code that could possibly throw error here, as doing + * so would leave the table entry incomplete and hence corrupt the + * caller's data structure. */ - if (!IS_PARTITIONED(hctl) && - hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && - !has_seq_scans(hashp)) - { - /* - * NOTE: failure to expand table is not a fatal error, it just - * means we have to run at higher fill factor than we wanted. - */ - expand_table(hashp); - } return (void *) ELEMENTKEY(currBucket); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 745e7be68e..81cf136937 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1466,7 +1466,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &XLogArchiveTimeout, - 0, 0, INT_MAX, + 0, 0, INT_MAX / 2, NULL, NULL, NULL }, { @@ -1476,7 +1476,7 @@ static struct config_int ConfigureNamesInt[] = GUC_NOT_IN_SAMPLE | GUC_UNIT_S }, &PostAuthDelay, - 0, 0, INT_MAX, + 0, 0, INT_MAX / 1000000, NULL, NULL, NULL }, { @@ -2147,7 +2147,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_MIN }, &Log_RotationAge, - HOURS_PER_DAY * MINS_PER_HOUR, 0, INT_MAX / MINS_PER_HOUR, + HOURS_PER_DAY * MINS_PER_HOUR, 0, INT_MAX / SECS_PER_MINUTE, NULL, NULL, NULL }, diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 5713bbe12c..b981f975af 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -1055,3 +1055,22 @@ pg_cursor(PG_FUNCTION_ARGS) return (Datum) 0; } + +bool +ThereAreNoReadyPortals(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + if (portal->status == PORTAL_READY) + return false; + } + + return true; +} diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index fa514f6b48..5705a2d75e 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -1184,3 +1184,12 @@ DeleteAllExportedSnapshotFiles(void) FreeDir(s_dir); } + +bool +ThereAreNoPriorRegisteredSnapshots(void) +{ + if (RegisteredSnapshots <= 1) + return true; + + return false; +} diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index c56f721e08..40740dcb72 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -118,6 +118,7 @@ static const char *authmethodlocal = ""; static bool debug = false; static bool noclean = false; static bool do_sync = true; +static bool sync_only = false; static bool show_setting = false; static char *xlog_dir = ""; @@ -144,6 +145,7 @@ static char infoversion[100]; static bool caught_signal = false; static bool output_failed = false; static int output_errno = 
0; +static char *pgdata_native; /* defaults */ static int n_connections = 10; @@ -172,6 +174,27 @@ static char *authwarning = NULL; static const char *boot_options = "-F"; static const char *backend_options = "--single -F -O -c search_path=pg_catalog -c exit_on_error=true"; +#ifdef WIN32 +char *restrict_env; +#endif +const char *subdirs[] = { + "global", + "pg_xlog", + "pg_xlog/archive_status", + "pg_clog", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_tblspc", + "pg_stat_tmp" +}; + /* path to 'initdb' binary directory */ static char bin_path[MAXPGPATH]; @@ -227,6 +250,17 @@ static bool check_locale_name(int category, const char *locale, static bool check_locale_encoding(const char *locale, int encoding); static void setlocales(void); static void usage(const char *progname); +void get_restricted_token(void); +void setup_pgdata(void); +void setup_bin_paths(const char *argv0); +void setup_data_file_paths(void); +void setup_locale_encoding(void); +void setup_signals(void); +void setup_text_search(void); +void create_data_directory(void); +void create_xlog_symlink(void); +void initialize_data_directory(void); + #ifdef WIN32 static int CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo); @@ -417,6 +451,7 @@ readfile(const char *path) int maxlength = 1, linelen = 0; int nlines = 0; + int n; char **result; char *buffer; int c; @@ -454,13 +489,13 @@ readfile(const char *path) /* now reprocess the file and store the lines */ rewind(infile); - nlines = 0; - while (fgets(buffer, maxlength + 1, infile) != NULL) - result[nlines++] = pg_strdup(buffer); + n = 0; + while (fgets(buffer, maxlength + 1, infile) != NULL && n < nlines) + result[n++] = pg_strdup(buffer); fclose(infile); free(buffer); - result[nlines] = NULL; + result[n] = NULL; return result; } @@ -2764,6 +2799,7 @@ usage(const char *progname) printf(_(" -n, --noclean do not clean up after errors\n")); printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n")); printf(_(" -s, --show show internal settings\n")); + printf(_(" -S, --sync-only only sync data directory\n")); printf(_("\nOther options:\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); @@ -2822,242 +2858,9 @@ check_need_password(const char *authmethodlocal, const char *authmethodhost) } } -int -main(int argc, char *argv[]) +void +get_restricted_token(void) { - /* - * options with no short version return a low integer, the rest return - * their short version value - */ - static struct option long_options[] = { - {"pgdata", required_argument, NULL, 'D'}, - {"encoding", required_argument, NULL, 'E'}, - {"locale", required_argument, NULL, 1}, - {"lc-collate", required_argument, NULL, 2}, - {"lc-ctype", required_argument, NULL, 3}, - {"lc-monetary", required_argument, NULL, 4}, - {"lc-numeric", required_argument, NULL, 5}, - {"lc-time", required_argument, NULL, 6}, - {"lc-messages", required_argument, NULL, 7}, - {"no-locale", no_argument, NULL, 8}, - {"text-search-config", required_argument, NULL, 'T'}, - {"auth", required_argument, NULL, 'A'}, - {"auth-local", required_argument, NULL, 10}, - {"auth-host", required_argument, NULL, 11}, - {"pwprompt", no_argument, NULL, 'W'}, - {"pwfile", required_argument, NULL, 9}, - {"username", required_argument, NULL, 'U'}, - {"help", no_argument, NULL, '?'}, - {"version", no_argument, NULL, 'V'}, - {"debug", 
no_argument, NULL, 'd'}, - {"show", no_argument, NULL, 's'}, - {"noclean", no_argument, NULL, 'n'}, - {"nosync", no_argument, NULL, 'N'}, - {"xlogdir", required_argument, NULL, 'X'}, - {NULL, 0, NULL, 0} - }; - - int c, - i, - ret; - int option_index; - char *effective_user; - char *pgdenv; /* PGDATA value gotten from and sent to - * environment */ - char bin_dir[MAXPGPATH]; - char *pg_data_native; - int user_enc; - -#ifdef WIN32 - char *restrict_env; -#endif - static const char *subdirs[] = { - "global", - "pg_xlog", - "pg_xlog/archive_status", - "pg_clog", - "pg_notify", - "pg_serial", - "pg_snapshots", - "pg_subtrans", - "pg_twophase", - "pg_multixact/members", - "pg_multixact/offsets", - "base", - "base/1", - "pg_tblspc", - "pg_stat_tmp" - }; - - progname = get_progname(argv[0]); - set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("initdb")); - - if (argc > 1) - { - if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) - { - usage(progname); - exit(0); - } - if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) - { - puts("initdb (PostgreSQL) " PG_VERSION); - exit(0); - } - } - - /* process command-line options */ - - while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sT:X:", long_options, &option_index)) != -1) - { - switch (c) - { - case 'A': - authmethodlocal = authmethodhost = pg_strdup(optarg); - - /* - * When ident is specified, use peer for local connections. - * Mirrored, when peer is specified, use ident for TCP/IP - * connections. - */ - if (strcmp(authmethodhost, "ident") == 0) - authmethodlocal = "peer"; - else if (strcmp(authmethodlocal, "peer") == 0) - authmethodhost = "ident"; - break; - case 10: - authmethodlocal = pg_strdup(optarg); - break; - case 11: - authmethodhost = pg_strdup(optarg); - break; - case 'D': - pg_data = pg_strdup(optarg); - break; - case 'E': - encoding = pg_strdup(optarg); - break; - case 'W': - pwprompt = true; - break; - case 'U': - username = pg_strdup(optarg); - break; - case 'd': - debug = true; - printf(_("Running in debug mode.\n")); - break; - case 'n': - noclean = true; - printf(_("Running in noclean mode. 
Mistakes will not be cleaned up.\n")); - break; - case 'N': - do_sync = false; - break; - case 'L': - share_path = pg_strdup(optarg); - break; - case 1: - locale = pg_strdup(optarg); - break; - case 2: - lc_collate = pg_strdup(optarg); - break; - case 3: - lc_ctype = pg_strdup(optarg); - break; - case 4: - lc_monetary = pg_strdup(optarg); - break; - case 5: - lc_numeric = pg_strdup(optarg); - break; - case 6: - lc_time = pg_strdup(optarg); - break; - case 7: - lc_messages = pg_strdup(optarg); - break; - case 8: - locale = "C"; - break; - case 9: - pwfilename = pg_strdup(optarg); - break; - case 's': - show_setting = true; - break; - case 'T': - default_text_search_config = pg_strdup(optarg); - break; - case 'X': - xlog_dir = pg_strdup(optarg); - break; - default: - /* getopt_long already emitted a complaint */ - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit(1); - } - } - - - /* - * Non-option argument specifies data directory as long as it wasn't - * already specified with -D / --pgdata - */ - if (optind < argc && strlen(pg_data) == 0) - { - pg_data = pg_strdup(argv[optind]); - optind++; - } - - if (optind < argc) - { - fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"), - progname, argv[optind]); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit(1); - } - - if (pwprompt && pwfilename) - { - fprintf(stderr, _("%s: password prompt and password file cannot be specified together\n"), progname); - exit(1); - } - - check_authmethod_unspecified(&authmethodlocal); - check_authmethod_unspecified(&authmethodhost); - - check_authmethod_valid(authmethodlocal, auth_methods_local, "local"); - check_authmethod_valid(authmethodhost, auth_methods_host, "host"); - - check_need_password(authmethodlocal, authmethodhost); - - if (strlen(pg_data) == 0) - { - pgdenv = getenv("PGDATA"); - if (pgdenv && strlen(pgdenv)) - { - /* PGDATA found */ - pg_data = pg_strdup(pgdenv); - } - else - { - fprintf(stderr, - _("%s: no data directory specified\n" - "You must identify the directory where the data for this database system\n" - "will reside. Do this with either the invocation option -D or the\n" - "environment variable PGDATA.\n"), - progname); - exit(1); - } - } - - pg_data_native = pg_data; - canonicalize_path(pg_data); - #ifdef WIN32 /* * Before we execute another program, make sure that we are running with a @@ -3100,6 +2903,35 @@ main(int argc, char *argv[]) } } #endif +} + +void +setup_pgdata(void) +{ + char *pgdata_get_env, *pgdata_set_env; + + if (strlen(pg_data) == 0) + { + pgdata_get_env = getenv("PGDATA"); + if (pgdata_get_env && strlen(pgdata_get_env)) + { + /* PGDATA found */ + pg_data = pg_strdup(pgdata_get_env); + } + else + { + fprintf(stderr, + _("%s: no data directory specified\n" + "You must identify the directory where the data for this database system\n" + "will reside. Do this with either the invocation option -D or the\n" + "environment variable PGDATA.\n"), + progname); + exit(1); + } + } + + pgdata_native = pg_strdup(pg_data); + canonicalize_path(pg_data); /* * we have to set PGDATA for postgres rather than pass it on the command @@ -3107,16 +2939,23 @@ main(int argc, char *argv[]) * need quotes otherwise on Windows because paths there are most likely to * have embedded spaces. 
*/ - pgdenv = pg_malloc(8 + strlen(pg_data)); - sprintf(pgdenv, "PGDATA=%s", pg_data); - putenv(pgdenv); + pgdata_set_env = pg_malloc(8 + strlen(pg_data)); + sprintf(pgdata_set_env, "PGDATA=%s", pg_data); + putenv(pgdata_set_env); +} - if ((ret = find_other_exec(argv[0], "postgres", PG_BACKEND_VERSIONSTR, + +void +setup_bin_paths(const char *argv0) +{ + int ret; + + if ((ret = find_other_exec(argv0, "postgres", PG_BACKEND_VERSIONSTR, backend_exec)) < 0) { char full_path[MAXPGPATH]; - if (find_my_exec(argv[0], full_path) < 0) + if (find_my_exec(argv0, full_path) < 0) strlcpy(full_path, progname, sizeof(full_path)); if (ret == -1) @@ -3152,63 +2991,15 @@ main(int argc, char *argv[]) } canonicalize_path(share_path); +} - effective_user = get_id(); - if (strlen(username) == 0) - username = effective_user; - - set_input(&bki_file, "postgres.bki"); - set_input(&desc_file, "postgres.description"); - set_input(&shdesc_file, "postgres.shdescription"); - set_input(&hba_file, "pg_hba.conf.sample"); - set_input(&ident_file, "pg_ident.conf.sample"); - set_input(&conf_file, "postgresql.conf.sample"); - set_input(&conversion_file, "conversion_create.sql"); - set_input(&dictionary_file, "snowball_create.sql"); - set_input(&info_schema_file, "information_schema.sql"); - set_input(&features_file, "sql_features.txt"); - set_input(&system_views_file, "system_views.sql"); - - set_info_version(); - - if (show_setting || debug) - { - fprintf(stderr, - "VERSION=%s\n" - "PGDATA=%s\nshare_path=%s\nPGPATH=%s\n" - "POSTGRES_SUPERUSERNAME=%s\nPOSTGRES_BKI=%s\n" - "POSTGRES_DESCR=%s\nPOSTGRES_SHDESCR=%s\n" - "POSTGRESQL_CONF_SAMPLE=%s\n" - "PG_HBA_SAMPLE=%s\nPG_IDENT_SAMPLE=%s\n", - PG_VERSION, - pg_data, share_path, bin_path, - username, bki_file, - desc_file, shdesc_file, - conf_file, - hba_file, ident_file); - if (show_setting) - exit(0); - } - - check_input(bki_file); - check_input(desc_file); - check_input(shdesc_file); - check_input(hba_file); - check_input(ident_file); - check_input(conf_file); - check_input(conversion_file); - check_input(dictionary_file); - check_input(info_schema_file); - check_input(features_file); - check_input(system_views_file); +void +setup_locale_encoding(void) +{ + int user_enc; setlocales(); - printf(_("The files belonging to this database system will be owned " - "by user \"%s\".\n" - "This user must also own the server process.\n\n"), - effective_user); - if (strcmp(lc_ctype, lc_collate) == 0 && strcmp(lc_ctype, lc_time) == 0 && strcmp(lc_ctype, lc_numeric) == 0 && @@ -3288,6 +3079,60 @@ main(int argc, char *argv[]) !check_locale_encoding(lc_collate, user_enc)) exit(1); /* check_locale_encoding printed the error */ +} + + +void +setup_data_file_paths(void) +{ + set_input(&bki_file, "postgres.bki"); + set_input(&desc_file, "postgres.description"); + set_input(&shdesc_file, "postgres.shdescription"); + set_input(&hba_file, "pg_hba.conf.sample"); + set_input(&ident_file, "pg_ident.conf.sample"); + set_input(&conf_file, "postgresql.conf.sample"); + set_input(&conversion_file, "conversion_create.sql"); + set_input(&dictionary_file, "snowball_create.sql"); + set_input(&info_schema_file, "information_schema.sql"); + set_input(&features_file, "sql_features.txt"); + set_input(&system_views_file, "system_views.sql"); + + if (show_setting || debug) + { + fprintf(stderr, + "VERSION=%s\n" + "PGDATA=%s\nshare_path=%s\nPGPATH=%s\n" + "POSTGRES_SUPERUSERNAME=%s\nPOSTGRES_BKI=%s\n" + "POSTGRES_DESCR=%s\nPOSTGRES_SHDESCR=%s\n" + "POSTGRESQL_CONF_SAMPLE=%s\n" + 
"PG_HBA_SAMPLE=%s\nPG_IDENT_SAMPLE=%s\n", + PG_VERSION, + pg_data, share_path, bin_path, + username, bki_file, + desc_file, shdesc_file, + conf_file, + hba_file, ident_file); + if (show_setting) + exit(0); + } + + check_input(bki_file); + check_input(desc_file); + check_input(shdesc_file); + check_input(hba_file); + check_input(ident_file); + check_input(conf_file); + check_input(conversion_file); + check_input(dictionary_file); + check_input(info_schema_file); + check_input(features_file); + check_input(system_views_file); +} + + +void +setup_text_search(void) +{ if (strlen(default_text_search_config) == 0) { default_text_search_config = find_matching_ts_config(lc_ctype); @@ -3317,14 +3162,12 @@ main(int argc, char *argv[]) printf(_("The default text search configuration will be set to \"%s\".\n"), default_text_search_config); - printf("\n"); - - umask(S_IRWXG | S_IRWXO); +} - /* - * now we are starting to do real work, trap signals so we can clean up - */ +void +setup_signals(void) +{ /* some of these are not valid on Windows */ #ifdef SIGHUP pqsignal(SIGHUP, trapsig); @@ -3343,7 +3186,12 @@ main(int argc, char *argv[]) #ifdef SIGPIPE pqsignal(SIGPIPE, SIG_IGN); #endif +} + +void +create_data_directory(void) +{ switch (pg_check_dir(pg_data)) { case 0: @@ -3396,7 +3244,12 @@ main(int argc, char *argv[]) progname, pg_data, strerror(errno)); exit_nicely(); } +} + +void +create_xlog_symlink(void) +{ /* Create transaction log symlink, if required */ if (strcmp(xlog_dir, "") != 0) { @@ -3483,6 +3336,21 @@ main(int argc, char *argv[]) exit_nicely(); #endif } +} + + +void +initialize_data_directory(void) +{ + int i; + + setup_signals(); + + umask(S_IRWXG | S_IRWXO); + + create_data_directory(); + + create_xlog_symlink(); /* Create required subdirectories */ printf(_("creating subdirectories ... 
")); @@ -3543,7 +3411,234 @@ main(int argc, char *argv[]) make_template0(); make_postgres(); +} + + +int +main(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"pgdata", required_argument, NULL, 'D'}, + {"encoding", required_argument, NULL, 'E'}, + {"locale", required_argument, NULL, 1}, + {"lc-collate", required_argument, NULL, 2}, + {"lc-ctype", required_argument, NULL, 3}, + {"lc-monetary", required_argument, NULL, 4}, + {"lc-numeric", required_argument, NULL, 5}, + {"lc-time", required_argument, NULL, 6}, + {"lc-messages", required_argument, NULL, 7}, + {"no-locale", no_argument, NULL, 8}, + {"text-search-config", required_argument, NULL, 'T'}, + {"auth", required_argument, NULL, 'A'}, + {"auth-local", required_argument, NULL, 10}, + {"auth-host", required_argument, NULL, 11}, + {"pwprompt", no_argument, NULL, 'W'}, + {"pwfile", required_argument, NULL, 9}, + {"username", required_argument, NULL, 'U'}, + {"help", no_argument, NULL, '?'}, + {"version", no_argument, NULL, 'V'}, + {"debug", no_argument, NULL, 'd'}, + {"show", no_argument, NULL, 's'}, + {"noclean", no_argument, NULL, 'n'}, + {"nosync", no_argument, NULL, 'N'}, + {"sync-only", no_argument, NULL, 'S'}, + {"xlogdir", required_argument, NULL, 'X'}, + {NULL, 0, NULL, 0} + }; + + /* + * options with no short version return a low integer, the rest return + * their short version value + */ + int c; + int option_index; + char *effective_user; + char bin_dir[MAXPGPATH]; + + progname = get_progname(argv[0]); + set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("initdb")); + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + usage(progname); + exit(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("initdb (PostgreSQL) " PG_VERSION); + exit(0); + } + } + + /* process command-line options */ + + while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sST:X:", long_options, &option_index)) != -1) + { + switch (c) + { + case 'A': + authmethodlocal = authmethodhost = pg_strdup(optarg); + + /* + * When ident is specified, use peer for local connections. + * Mirrored, when peer is specified, use ident for TCP/IP + * connections. + */ + if (strcmp(authmethodhost, "ident") == 0) + authmethodlocal = "peer"; + else if (strcmp(authmethodlocal, "peer") == 0) + authmethodhost = "ident"; + break; + case 10: + authmethodlocal = pg_strdup(optarg); + break; + case 11: + authmethodhost = pg_strdup(optarg); + break; + case 'D': + pg_data = pg_strdup(optarg); + break; + case 'E': + encoding = pg_strdup(optarg); + break; + case 'W': + pwprompt = true; + break; + case 'U': + username = pg_strdup(optarg); + break; + case 'd': + debug = true; + printf(_("Running in debug mode.\n")); + break; + case 'n': + noclean = true; + printf(_("Running in noclean mode. 
Mistakes will not be cleaned up.\n")); + break; + case 'N': + do_sync = false; + break; + case 'S': + sync_only = true; + break; + case 'L': + share_path = pg_strdup(optarg); + break; + case 1: + locale = pg_strdup(optarg); + break; + case 2: + lc_collate = pg_strdup(optarg); + break; + case 3: + lc_ctype = pg_strdup(optarg); + break; + case 4: + lc_monetary = pg_strdup(optarg); + break; + case 5: + lc_numeric = pg_strdup(optarg); + break; + case 6: + lc_time = pg_strdup(optarg); + break; + case 7: + lc_messages = pg_strdup(optarg); + break; + case 8: + locale = "C"; + break; + case 9: + pwfilename = pg_strdup(optarg); + break; + case 's': + show_setting = true; + break; + case 'T': + default_text_search_config = pg_strdup(optarg); + break; + case 'X': + xlog_dir = pg_strdup(optarg); + break; + default: + /* getopt_long already emitted a complaint */ + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(1); + } + } + + + /* + * Non-option argument specifies data directory as long as it wasn't + * already specified with -D / --pgdata + */ + if (optind < argc && strlen(pg_data) == 0) + { + pg_data = pg_strdup(argv[optind]); + optind++; + } + + if (optind < argc) + { + fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"), + progname, argv[optind]); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(1); + } + + /* If we only need to fsync, just to it and exit */ + if (sync_only) + { + setup_pgdata(); + perform_fsync(); + return 0; + } + + if (pwprompt && pwfilename) + { + fprintf(stderr, _("%s: password prompt and password file cannot be specified together\n"), progname); + exit(1); + } + + check_authmethod_unspecified(&authmethodlocal); + check_authmethod_unspecified(&authmethodhost); + + check_authmethod_valid(authmethodlocal, auth_methods_local, "local"); + check_authmethod_valid(authmethodhost, auth_methods_host, "host"); + + check_need_password(authmethodlocal, authmethodhost); + + get_restricted_token(); + + setup_pgdata(); + + setup_bin_paths(argv[0]); + + effective_user = get_id(); + if (strlen(username) == 0) + username = effective_user; + + printf(_("The files belonging to this database system will be owned " + "by user \"%s\".\n" + "This user must also own the server process.\n\n"), + effective_user); + + set_info_version(); + + setup_data_file_paths(); + + setup_locale_encoding(); + + setup_text_search(); + + printf("\n"); + initialize_data_directory(); + if (do_sync) perform_fsync(); else @@ -3561,9 +3656,9 @@ main(int argc, char *argv[]) "or\n" " %s%s%spg_ctl%s -D %s%s%s -l logfile start\n\n"), QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? DIR_SEP : "", QUOTE_PATH, - QUOTE_PATH, pg_data_native, QUOTE_PATH, + QUOTE_PATH, pgdata_native, QUOTE_PATH, QUOTE_PATH, bin_dir, (strlen(bin_dir) > 0) ? 
DIR_SEP : "", QUOTE_PATH, - QUOTE_PATH, pg_data_native, QUOTE_PATH); + QUOTE_PATH, pgdata_native, QUOTE_PATH); return 0; } diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c index 843fc69294..54524834f7 100644 --- a/src/bin/pg_basebackup/pg_receivexlog.c +++ b/src/bin/pg_basebackup/pg_receivexlog.c @@ -315,6 +315,7 @@ main(int argc, char **argv) {"verbose", no_argument, NULL, 'v'}, {NULL, 0, NULL, 0} }; + int c; int option_index; diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 404ff91715..de82ff54d8 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -19,29 +19,27 @@ */ #define FRONTEND 1 #include "postgres.h" + +#include +#include +#include +#include +/* for ntohl/htonl */ +#include +#include + #include "libpq-fe.h" #include "access/xlog_internal.h" -#include "replication/walprotocol.h" #include "utils/datetime.h" #include "utils/timestamp.h" #include "receivelog.h" #include "streamutil.h" -#include -#include -#include -#include - - -/* Size of the streaming replication protocol headers */ -#define STREAMING_HEADER_SIZE (1+sizeof(WalDataMessageHeader)) -#define STREAMING_KEEPALIVE_SIZE (1+sizeof(PrimaryKeepaliveMessage)) /* fd for currently open WAL file */ static int walfile = -1; - /* * Open a new WAL file in the specified directory. Store the name * (not including the full directory) in namebuf. Assumes there is @@ -189,37 +187,34 @@ close_walfile(char *basedir, char *walname, bool segment_complete) /* * Local version of GetCurrentTimestamp(), since we are not linked with - * backend code. + * backend code. The protocol always uses integer timestamps, regardless of + * server setting. */ -static TimestampTz +static int64 localGetCurrentTimestamp(void) { - TimestampTz result; + int64 result; struct timeval tp; gettimeofday(&tp, NULL); - result = (TimestampTz) tp.tv_sec - + result = (int64) tp.tv_sec - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY); -#ifdef HAVE_INT64_TIMESTAMP result = (result * USECS_PER_SEC) + tp.tv_usec; -#else - result = result + (tp.tv_usec / 1000000.0); -#endif return result; } /* - * Local version of TimestampDifference(), since we are not - * linked with backend code. + * Local version of TimestampDifference(), since we are not linked with + * backend code. */ static void -localTimestampDifference(TimestampTz start_time, TimestampTz stop_time, +localTimestampDifference(int64 start_time, int64 stop_time, long *secs, int *microsecs) { - TimestampTz diff = stop_time - start_time; + int64 diff = stop_time - start_time; if (diff <= 0) { @@ -228,13 +223,8 @@ localTimestampDifference(TimestampTz start_time, TimestampTz stop_time, } else { -#ifdef HAVE_INT64_TIMESTAMP *secs = (long) (diff / USECS_PER_SEC); *microsecs = (int) (diff % USECS_PER_SEC); -#else - *secs = (long) diff; - *microsecs = (int) ((diff - *secs) * 1000000.0); -#endif } } @@ -243,17 +233,86 @@ localTimestampDifference(TimestampTz start_time, TimestampTz stop_time, * linked with backend code. */ static bool -localTimestampDifferenceExceeds(TimestampTz start_time, - TimestampTz stop_time, +localTimestampDifferenceExceeds(int64 start_time, + int64 stop_time, int msec) { - TimestampTz diff = stop_time - start_time; + int64 diff = stop_time - start_time; -#ifdef HAVE_INT64_TIMESTAMP return (diff >= msec * INT64CONST(1000)); -#else - return (diff * 1000.0 >= msec); -#endif +} + +/* + * Converts an int64 to network byte order. 
+ */ +static void +sendint64(int64 i, char *buf) +{ + uint32 n32; + + /* High order half first, since we're doing MSB-first */ + n32 = (uint32) (i >> 32); + n32 = htonl(n32); + memcpy(&buf[0], &n32, 4); + + /* Now the low order half */ + n32 = (uint32) i; + n32 = htonl(n32); + memcpy(&buf[4], &n32, 4); +} + +/* + * Converts an int64 from network byte order to native format. + */ +static int64 +recvint64(char *buf) +{ + int64 result; + uint32 h32; + uint32 l32; + + memcpy(&h32, buf, 4); + memcpy(&l32, buf + 4, 4); + h32 = ntohl(h32); + l32 = ntohl(l32); + + result = h32; + result <<= 32; + result |= l32; + + return result; +} + +/* + * Send a Standby Status Update message to server. + */ +static bool +sendFeedback(PGconn *conn, XLogRecPtr blockpos, int64 now, bool replyRequested) +{ + char replybuf[1 + 8 + 8 + 8 + 8 + 1]; + int len = 0; + + replybuf[len] = 'r'; + len += 1; + sendint64(blockpos, &replybuf[len]); /* write */ + len += 8; + sendint64(InvalidXLogRecPtr, &replybuf[len]); /* flush */ + len += 8; + sendint64(InvalidXLogRecPtr, &replybuf[len]); /* apply */ + len += 8; + sendint64(now, &replybuf[len]); /* sendTime */ + len += 8; + replybuf[len] = replyRequested ? 1 : 0; /* replyRequested */ + len += 1; + + if (PQputCopyData(conn, replybuf, len) <= 0 || PQflush(conn)) + { + fprintf(stderr, _("%s: could not send feedback packet: %s"), + progname, PQerrorMessage(conn)); + return false; + } + + return true; } /* @@ -354,6 +413,7 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, int bytes_left; int bytes_written; int64 now; + int hdr_len; if (copybuf != NULL) { @@ -382,24 +442,8 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, standby_message_timeout)) { /* Time to send feedback! */ - char replybuf[sizeof(StandbyReplyMessage) + 1]; - StandbyReplyMessage *replymsg; - - replymsg = (StandbyReplyMessage *) (replybuf + 1); - replymsg->write = blockpos; - replymsg->flush = InvalidXLogRecPtr; - replymsg->apply = InvalidXLogRecPtr; - replymsg->sendTime = now; - replybuf[0] = 'r'; - - if (PQputCopyData(conn, replybuf, sizeof(replybuf)) <= 0 || - PQflush(conn)) - { - fprintf(stderr, _("%s: could not send feedback packet: %s"), - progname, PQerrorMessage(conn)); + if (!sendFeedback(conn, blockpos, now, false)) goto error; - } - last_status = now; } @@ -419,12 +463,11 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, FD_SET(PQsocket(conn), &input_mask); if (standby_message_timeout) { - TimestampTz targettime; + int64 targettime; long secs; int usecs; - targettime = TimestampTzPlusMilliseconds(last_status, - standby_message_timeout - 1); + targettime = last_status + (standby_message_timeout - 1) * ((int64) 1000); localTimestampDifference(now, targettime, &secs, @@ -474,19 +517,38 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, progname, PQerrorMessage(conn)); goto error; } + + /* Check the message type. */ if (copybuf[0] == 'k') { + int pos; + bool replyRequested; + /* - * keepalive message, sent in 9.2 and newer. We just ignore this - * message completely, but need to skip past it in the stream. + * Parse the keepalive message, enclosed in the CopyData message. + * We just check if the server requested a reply, and ignore the + * rest. 
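+ * (For reference, the keepalive body handled here is: 1-byte message
+ * type 'k', 8-byte walEnd, 8-byte sendTime, then a 1-byte replyRequested
+ * flag -- hence the offsets used below.)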
*/ - if (r != STREAMING_KEEPALIVE_SIZE) + pos = 1; /* skip msgtype 'k' */ + pos += 8; /* skip walEnd */ + pos += 8; /* skip sendTime */ + + if (r < pos + 1) { - fprintf(stderr, - _("%s: keepalive message has incorrect size %d\n"), + fprintf(stderr, _("%s: streaming header too small: %d\n"), progname, r); goto error; } + replyRequested = copybuf[pos]; + + /* If the server requested an immediate reply, send one. */ + if (replyRequested) + { + now = localGetCurrentTimestamp(); + if (!sendFeedback(conn, blockpos, now, false)) + goto error; + last_status = now; + } continue; } else if (copybuf[0] != 'w') @@ -495,15 +557,25 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, progname, copybuf[0]); goto error; } - if (r < STREAMING_HEADER_SIZE + 1) + + /* + * Read the header of the XLogData message, enclosed in the CopyData + * message. We only need the WAL location field (dataStart), the rest + * of the header is ignored. + */ + hdr_len = 1; /* msgtype 'w' */ + hdr_len += 8; /* dataStart */ + hdr_len += 8; /* walEnd */ + hdr_len += 8; /* sendTime */ + if (r < hdr_len + 1) { fprintf(stderr, _("%s: streaming header too small: %d\n"), progname, r); goto error; } + blockpos = recvint64(©buf[1]); /* Extract WAL location for this block */ - memcpy(&blockpos, copybuf + 1, 8); xlogoff = blockpos % XLOG_SEG_SIZE; /* @@ -534,7 +606,7 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, } } - bytes_left = r - STREAMING_HEADER_SIZE; + bytes_left = r - hdr_len; bytes_written = 0; while (bytes_left) @@ -560,7 +632,7 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, } if (write(walfile, - copybuf + STREAMING_HEADER_SIZE + bytes_written, + copybuf + hdr_len + bytes_written, bytes_to_write) != bytes_to_write) { fprintf(stderr, diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 129c4d0dda..0fe68bb9e1 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -234,9 +234,11 @@ main(int argc, char *argv[]) ControlFile.checkPointCopy.oldestActiveXid); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); - printf(_("Minimum recovery ending location: %X/%X\n"), + printf(_("Min recovery ending location: %X/%X\n"), (uint32) (ControlFile.minRecoveryPoint >> 32), (uint32) ControlFile.minRecoveryPoint); + printf(_("Min recovery ending loc's timeline: %u\n"), + ControlFile.minRecoveryPointTLI); printf(_("Backup start location: %X/%X\n"), (uint32) (ControlFile.backupStartPoint >> 32), (uint32) ControlFile.backupStartPoint); diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 23c47f6275..2a00cd2255 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -20,6 +20,7 @@ #include "postgres_fe.h" #include "libpq-fe.h" +#include #include #include #include @@ -315,50 +316,84 @@ get_pgpid(void) static char ** readfile(const char *path) { - FILE *infile; - int maxlength = 1, - linelen = 0; - int nlines = 0; + int fd; + int nlines; char **result; char *buffer; - int c; + char *linebegin; + int i; + int n; + int len; + struct stat statbuf; - if ((infile = fopen(path, "r")) == NULL) + /* + * Slurp the file into memory. + * + * The file can change concurrently, so we read the whole file into memory + * with a single read() call. That's not guaranteed to get an atomic + * snapshot, but in practice, for a small file, it's close enough for the + * current use. 
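+ * (The file read this way in pg_ctl is typically postmaster.pid, which a
+ * starting postmaster keeps appending lines to -- that is the kind of
+ * concurrent change being tolerated here.)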
+ */ + fd = open(path, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + return NULL; + if (fstat(fd, &statbuf) < 0) + { + close(fd); return NULL; + } + if (statbuf.st_size == 0) + { + /* empty file */ + close(fd); + result = (char **) pg_malloc(sizeof(char *)); + *result = NULL; + return result; + } + buffer = pg_malloc(statbuf.st_size + 1); - /* pass over the file twice - the first time to size the result */ + len = read(fd, buffer, statbuf.st_size + 1); + close(fd); + if (len != statbuf.st_size) + { + /* oops, the file size changed between fstat and read */ + free(buffer); + return NULL; + } - while ((c = fgetc(infile)) != EOF) + /* + * Count newlines. We expect there to be a newline after each full line, + * including one at the end of file. If there isn't a newline at the end, + * any characters after the last newline will be ignored. + */ + nlines = 0; + for (i = 0; i < len; i++) { - linelen++; - if (c == '\n') - { + if (buffer[i] == '\n') nlines++; - if (linelen > maxlength) - maxlength = linelen; - linelen = 0; - } } - /* handle last line without a terminating newline (yuck) */ - if (linelen) - nlines++; - if (linelen > maxlength) - maxlength = linelen; - - /* set up the result and the line buffer */ + /* set up the result buffer */ result = (char **) pg_malloc((nlines + 1) * sizeof(char *)); - buffer = (char *) pg_malloc(maxlength + 1); - /* now reprocess the file and store the lines */ - rewind(infile); - nlines = 0; - while (fgets(buffer, maxlength + 1, infile) != NULL) - result[nlines++] = pg_strdup(buffer); + /* now split the buffer into lines */ + linebegin = buffer; + n = 0; + for (i = 0; i < len; i++) + { + if (buffer[i] == '\n') + { + int slen = &buffer[i] - linebegin + 1; + char *linebuf = pg_malloc(slen + 1); + memcpy(linebuf, linebegin, slen); + linebuf[slen] = '\0'; + result[n++] = linebuf; + linebegin = &buffer[i + 1]; + } + } + result[n] = NULL; - fclose(infile); free(buffer); - result[nlines] = NULL; return result; } diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index cd7fed9a99..319533141c 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -228,15 +228,6 @@ RestoreArchive(Archive *AHX) /* * Check for nonsensical option combinations. * - * NB: createDB+dropSchema is useless because if you're creating the DB, - * there's no need to drop individual items in it. Moreover, if we tried - * to do that then we'd issue the drops in the database initially - * connected to, not the one we will create, which is very bad... - */ - if (ropt->createDB && ropt->dropSchema) - exit_horribly(modulename, "-C and -c are incompatible options\n"); - - /* * -C is not compatible with -1, because we can't create a database inside * a transaction block. */ @@ -381,7 +372,25 @@ RestoreArchive(Archive *AHX) { AH->currentTE = te; - /* We want anything that's selected and has a dropStmt */ + /* + * In createDB mode, issue a DROP *only* for the database as a + * whole. Issuing drops against anything else would be wrong, + * because at this point we're connected to the wrong database. + * Conversely, if we're not in createDB mode, we'd better not + * issue a DROP against the database at all. 
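+ * (Concretely, a restore run with both -C and -c is expected to drop and
+ * recreate only the database itself, while -c alone drops the individual
+ * objects inside the existing target database.)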
+ */ + if (ropt->createDB) + { + if (strcmp(te->desc, "DATABASE") != 0) + continue; + } + else + { + if (strcmp(te->desc, "DATABASE") == 0) + continue; + } + + /* Otherwise, drop anything that's selected and has a dropStmt */ if (((te->reqs & (REQ_SCHEMA | REQ_DATA)) != 0) && te->dropStmt) { ahlog(AH, 1, "dropping %s %s\n", te->desc, te->tag); @@ -884,9 +893,6 @@ PrintTOCSummary(Archive *AHX, RestoreOptions *ropt) ahprintf(AH, ";\n;\n; Selected TOC Entries:\n;\n"); - /* We should print DATABASE entries whether or not -C was specified */ - ropt->createDB = 1; - curSection = SECTION_PRE_DATA; for (te = AH->toc->next; te != AH->toc; te = te->next) { diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 420fc8c36d..e0206b13cd 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -193,6 +193,7 @@ static void dumpTable(Archive *fout, TableInfo *tbinfo); static void dumpTableSchema(Archive *fout, TableInfo *tbinfo); static void dumpAttrDef(Archive *fout, AttrDefInfo *adinfo); static void dumpSequence(Archive *fout, TableInfo *tbinfo); +static void dumpSequenceData(Archive *fout, TableDataInfo *tdinfo); static void dumpIndex(Archive *fout, IndxInfo *indxinfo); static void dumpConstraint(Archive *fout, ConstraintInfo *coninfo); static void dumpTableConstraintComment(Archive *fout, ConstraintInfo *coninfo); @@ -1740,9 +1741,6 @@ makeTableDataInfo(TableInfo *tbinfo, bool oids) /* Skip VIEWs (no data to dump) */ if (tbinfo->relkind == RELKIND_VIEW) return; - /* Skip SEQUENCEs (handled elsewhere) */ - if (tbinfo->relkind == RELKIND_SEQUENCE) - return; /* Skip FOREIGN TABLEs (no data to dump) */ if (tbinfo->relkind == RELKIND_FOREIGN_TABLE) return; @@ -3634,6 +3632,7 @@ getAggregates(Archive *fout, int *numAggs) int i_proargtypes; int i_rolname; int i_aggacl; + int i_proiargs; /* Make sure we are in proper schema */ selectSourceSchema(fout, "pg_catalog"); @@ -3643,11 +3642,12 @@ getAggregates(Archive *fout, int *numAggs) * rationale behind the filtering logic. 
*/ - if (fout->remoteVersion >= 80200) + if (fout->remoteVersion >= 80400) { appendPQExpBuffer(query, "SELECT tableoid, oid, proname AS aggname, " "pronamespace AS aggnamespace, " "pronargs, proargtypes, " + "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs," "(%s proowner) AS rolname, " "proacl AS aggacl " "FROM pg_proc p " @@ -3665,12 +3665,28 @@ getAggregates(Archive *fout, int *numAggs) "deptype = 'e')"); appendPQExpBuffer(query, ")"); } + else if (fout->remoteVersion >= 80200) + { + appendPQExpBuffer(query, "SELECT tableoid, oid, proname AS aggname, " + "pronamespace AS aggnamespace, " + "pronargs, proargtypes, " + "NULL::text AS proiargs," + "(%s proowner) AS rolname, " + "proacl AS aggacl " + "FROM pg_proc p " + "WHERE proisagg AND (" + "pronamespace != " + "(SELECT oid FROM pg_namespace " + "WHERE nspname = 'pg_catalog'))", + username_subquery); + } else if (fout->remoteVersion >= 70300) { appendPQExpBuffer(query, "SELECT tableoid, oid, proname AS aggname, " "pronamespace AS aggnamespace, " "CASE WHEN proargtypes[0] = 'pg_catalog.\"any\"'::pg_catalog.regtype THEN 0 ELSE 1 END AS pronargs, " "proargtypes, " + "NULL::text AS proiargs, " "(%s proowner) AS rolname, " "proacl AS aggacl " "FROM pg_proc " @@ -3685,6 +3701,7 @@ getAggregates(Archive *fout, int *numAggs) "0::oid AS aggnamespace, " "CASE WHEN aggbasetype = 0 THEN 0 ELSE 1 END AS pronargs, " "aggbasetype AS proargtypes, " + "NULL::text AS proiargs, " "(%s aggowner) AS rolname, " "'{=X}' AS aggacl " "FROM pg_aggregate " @@ -3700,6 +3717,7 @@ getAggregates(Archive *fout, int *numAggs) "0::oid AS aggnamespace, " "CASE WHEN aggbasetype = 0 THEN 0 ELSE 1 END AS pronargs, " "aggbasetype AS proargtypes, " + "NULL::text AS proiargs, " "(%s aggowner) AS rolname, " "'{=X}' AS aggacl " "FROM pg_aggregate " @@ -3723,6 +3741,7 @@ getAggregates(Archive *fout, int *numAggs) i_proargtypes = PQfnumber(res, "proargtypes"); i_rolname = PQfnumber(res, "rolname"); i_aggacl = PQfnumber(res, "aggacl"); + i_proiargs = PQfnumber(res, "proiargs"); for (i = 0; i < ntups; i++) { @@ -3742,6 +3761,7 @@ getAggregates(Archive *fout, int *numAggs) agginfo[i].aggfn.lang = InvalidOid; /* not currently interesting */ agginfo[i].aggfn.prorettype = InvalidOid; /* not saved */ agginfo[i].aggfn.proacl = pg_strdup(PQgetvalue(res, i, i_aggacl)); + agginfo[i].aggfn.proiargs = pg_strdup(PQgetvalue(res, i, i_proiargs)); agginfo[i].aggfn.nargs = atoi(PQgetvalue(res, i, i_pronargs)); if (agginfo[i].aggfn.nargs == 0) agginfo[i].aggfn.argtypes = NULL; @@ -3793,6 +3813,7 @@ getFuncs(Archive *fout, int *numFuncs) int i_proargtypes; int i_prorettype; int i_proacl; + int i_proiargs; /* Make sure we are in proper schema */ selectSourceSchema(fout, "pg_catalog"); @@ -3813,12 +3834,13 @@ getFuncs(Archive *fout, int *numFuncs) * doesn't have; otherwise we might not get creation ordering correct. 
*/ - if (fout->remoteVersion >= 70300) + if (fout->remoteVersion >= 80400) { appendPQExpBuffer(query, "SELECT tableoid, oid, proname, prolang, " "pronargs, proargtypes, prorettype, proacl, " "pronamespace, " + "pg_catalog.pg_get_function_identity_arguments(oid) AS proiargs," "(%s proowner) AS rolname " "FROM pg_proc p " "WHERE NOT proisagg AND (" @@ -3840,6 +3862,21 @@ getFuncs(Archive *fout, int *numFuncs) "deptype = 'e')"); appendPQExpBuffer(query, ")"); } + else if (fout->remoteVersion >= 70300) + { + appendPQExpBuffer(query, + "SELECT tableoid, oid, proname, prolang, " + "pronargs, proargtypes, prorettype, proacl, " + "pronamespace, " + "NULL::text AS proiargs," + "(%s proowner) AS rolname " + "FROM pg_proc p " + "WHERE NOT proisagg AND (" + "pronamespace != " + "(SELECT oid FROM pg_namespace " + "WHERE nspname = 'pg_catalog'))", + username_subquery); + } else if (fout->remoteVersion >= 70100) { appendPQExpBuffer(query, @@ -3847,6 +3884,7 @@ getFuncs(Archive *fout, int *numFuncs) "pronargs, proargtypes, prorettype, " "'{=X}' AS proacl, " "0::oid AS pronamespace, " + "NULL::text AS proiargs," "(%s proowner) AS rolname " "FROM pg_proc " "WHERE pg_proc.oid > '%u'::oid", @@ -3863,6 +3901,7 @@ getFuncs(Archive *fout, int *numFuncs) "pronargs, proargtypes, prorettype, " "'{=X}' AS proacl, " "0::oid AS pronamespace, " + "NULL::text AS proiargs," "(%s proowner) AS rolname " "FROM pg_proc " "where pg_proc.oid > '%u'::oid", @@ -3888,6 +3927,7 @@ getFuncs(Archive *fout, int *numFuncs) i_proargtypes = PQfnumber(res, "proargtypes"); i_prorettype = PQfnumber(res, "prorettype"); i_proacl = PQfnumber(res, "proacl"); + i_proiargs = PQfnumber(res, "proiargs"); for (i = 0; i < ntups; i++) { @@ -3903,6 +3943,7 @@ getFuncs(Archive *fout, int *numFuncs) finfo[i].rolname = pg_strdup(PQgetvalue(res, i, i_rolname)); finfo[i].lang = atooid(PQgetvalue(res, i, i_prolang)); finfo[i].prorettype = atooid(PQgetvalue(res, i, i_prorettype)); + finfo[i].proiargs = pg_strdup(PQgetvalue(res, i, i_proiargs)); finfo[i].proacl = pg_strdup(PQgetvalue(res, i, i_proacl)); finfo[i].nargs = atoi(PQgetvalue(res, i, i_pronargs)); if (finfo[i].nargs == 0) @@ -7396,7 +7437,10 @@ dumpDumpableObject(Archive *fout, DumpableObject *dobj) dumpCast(fout, (CastInfo *) dobj); break; case DO_TABLE_DATA: - dumpTableData(fout, (TableDataInfo *) dobj); + if (((TableDataInfo *) dobj)->tdtable->relkind == RELKIND_SEQUENCE) + dumpSequenceData(fout, (TableDataInfo *) dobj); + else + dumpTableData(fout, (TableDataInfo *) dobj); break; case DO_DUMMY_TYPE: /* table rowtypes and array types are never dumped separately */ @@ -12304,13 +12348,13 @@ collectSecLabels(Archive *fout, SecLabelItem **items) static void dumpTable(Archive *fout, TableInfo *tbinfo) { - if (tbinfo->dobj.dump) + if (tbinfo->dobj.dump && !dataOnly) { char *namecopy; if (tbinfo->relkind == RELKIND_SEQUENCE) dumpSequence(fout, tbinfo); - else if (!dataOnly) + else dumpTableSchema(fout, tbinfo); /* Handle the ACL here */ @@ -13425,20 +13469,22 @@ findLastBuiltinOid_V70(Archive *fout) return last_oid; } +/* + * dumpSequence + * write the declaration (not data) of one user-defined sequence + */ static void dumpSequence(Archive *fout, TableInfo *tbinfo) { PGresult *res; char *startv, - *last, *incby, *maxv = NULL, *minv = NULL, *cache; char bufm[100], bufx[100]; - bool cycled, - called; + bool cycled; PQExpBuffer query = createPQExpBuffer(); PQExpBuffer delqry = createPQExpBuffer(); PQExpBuffer labelq = createPQExpBuffer(); @@ -13453,7 +13499,7 @@ dumpSequence(Archive *fout, TableInfo 
*tbinfo) { appendPQExpBuffer(query, "SELECT sequence_name, " - "start_value, last_value, increment_by, " + "start_value, increment_by, " "CASE WHEN increment_by > 0 AND max_value = %s THEN NULL " " WHEN increment_by < 0 AND max_value = -1 THEN NULL " " ELSE max_value " @@ -13462,7 +13508,7 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) " WHEN increment_by < 0 AND min_value = %s THEN NULL " " ELSE min_value " "END AS min_value, " - "cache_value, is_cycled, is_called from %s", + "cache_value, is_cycled FROM %s", bufx, bufm, fmtId(tbinfo->dobj.name)); } @@ -13470,7 +13516,7 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) { appendPQExpBuffer(query, "SELECT sequence_name, " - "0 AS start_value, last_value, increment_by, " + "0 AS start_value, increment_by, " "CASE WHEN increment_by > 0 AND max_value = %s THEN NULL " " WHEN increment_by < 0 AND max_value = -1 THEN NULL " " ELSE max_value " @@ -13479,7 +13525,7 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) " WHEN increment_by < 0 AND min_value = %s THEN NULL " " ELSE min_value " "END AS min_value, " - "cache_value, is_cycled, is_called from %s", + "cache_value, is_cycled FROM %s", bufx, bufm, fmtId(tbinfo->dobj.name)); } @@ -13506,165 +13552,120 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) #endif startv = PQgetvalue(res, 0, 1); - last = PQgetvalue(res, 0, 2); - incby = PQgetvalue(res, 0, 3); + incby = PQgetvalue(res, 0, 2); + if (!PQgetisnull(res, 0, 3)) + maxv = PQgetvalue(res, 0, 3); if (!PQgetisnull(res, 0, 4)) - maxv = PQgetvalue(res, 0, 4); - if (!PQgetisnull(res, 0, 5)) - minv = PQgetvalue(res, 0, 5); - cache = PQgetvalue(res, 0, 6); - cycled = (strcmp(PQgetvalue(res, 0, 7), "t") == 0); - called = (strcmp(PQgetvalue(res, 0, 8), "t") == 0); + minv = PQgetvalue(res, 0, 4); + cache = PQgetvalue(res, 0, 5); + cycled = (strcmp(PQgetvalue(res, 0, 6), "t") == 0); /* - * The logic we use for restoring sequences is as follows: - * - * Add a CREATE SEQUENCE statement as part of a "schema" dump (use - * last_val for start if called is false, else use min_val for start_val). - * Also, if the sequence is owned by a column, add an ALTER SEQUENCE OWNED - * BY command for it. - * - * Add a 'SETVAL(seq, last_val, iscalled)' as part of a "data" dump. + * DROP must be fully qualified in case same name appears in pg_catalog */ - if (!dataOnly) - { - /* - * DROP must be fully qualified in case same name appears in - * pg_catalog - */ - appendPQExpBuffer(delqry, "DROP SEQUENCE %s.", - fmtId(tbinfo->dobj.namespace->dobj.name)); - appendPQExpBuffer(delqry, "%s;\n", - fmtId(tbinfo->dobj.name)); + appendPQExpBuffer(delqry, "DROP SEQUENCE %s.", + fmtId(tbinfo->dobj.namespace->dobj.name)); + appendPQExpBuffer(delqry, "%s;\n", + fmtId(tbinfo->dobj.name)); - resetPQExpBuffer(query); + resetPQExpBuffer(query); - if (binary_upgrade) - { - binary_upgrade_set_pg_class_oids(fout, query, - tbinfo->dobj.catId.oid, false); - binary_upgrade_set_type_oids_by_rel_oid(fout, query, - tbinfo->dobj.catId.oid); - } + if (binary_upgrade) + { + binary_upgrade_set_pg_class_oids(fout, query, + tbinfo->dobj.catId.oid, false); + binary_upgrade_set_type_oids_by_rel_oid(fout, query, + tbinfo->dobj.catId.oid); + } - appendPQExpBuffer(query, - "CREATE SEQUENCE %s\n", - fmtId(tbinfo->dobj.name)); + appendPQExpBuffer(query, + "CREATE SEQUENCE %s\n", + fmtId(tbinfo->dobj.name)); - if (fout->remoteVersion >= 80400) - appendPQExpBuffer(query, " START WITH %s\n", startv); - else - { - /* - * Versions before 8.4 did not remember the true start value. 
If - * is_called is false then the sequence has never been incremented - * so we can use last_val. Otherwise punt and let it default. - */ - if (!called) - appendPQExpBuffer(query, " START WITH %s\n", last); - } + if (fout->remoteVersion >= 80400) + appendPQExpBuffer(query, " START WITH %s\n", startv); - appendPQExpBuffer(query, " INCREMENT BY %s\n", incby); + appendPQExpBuffer(query, " INCREMENT BY %s\n", incby); - if (minv) - appendPQExpBuffer(query, " MINVALUE %s\n", minv); - else - appendPQExpBuffer(query, " NO MINVALUE\n"); + if (minv) + appendPQExpBuffer(query, " MINVALUE %s\n", minv); + else + appendPQExpBuffer(query, " NO MINVALUE\n"); - if (maxv) - appendPQExpBuffer(query, " MAXVALUE %s\n", maxv); - else - appendPQExpBuffer(query, " NO MAXVALUE\n"); + if (maxv) + appendPQExpBuffer(query, " MAXVALUE %s\n", maxv); + else + appendPQExpBuffer(query, " NO MAXVALUE\n"); - appendPQExpBuffer(query, - " CACHE %s%s", - cache, (cycled ? "\n CYCLE" : "")); + appendPQExpBuffer(query, + " CACHE %s%s", + cache, (cycled ? "\n CYCLE" : "")); - appendPQExpBuffer(query, ";\n"); + appendPQExpBuffer(query, ";\n"); - appendPQExpBuffer(labelq, "SEQUENCE %s", fmtId(tbinfo->dobj.name)); + appendPQExpBuffer(labelq, "SEQUENCE %s", fmtId(tbinfo->dobj.name)); - /* binary_upgrade: no need to clear TOAST table oid */ + /* binary_upgrade: no need to clear TOAST table oid */ - if (binary_upgrade) - binary_upgrade_extension_member(query, &tbinfo->dobj, - labelq->data); + if (binary_upgrade) + binary_upgrade_extension_member(query, &tbinfo->dobj, + labelq->data); - ArchiveEntry(fout, tbinfo->dobj.catId, tbinfo->dobj.dumpId, - tbinfo->dobj.name, - tbinfo->dobj.namespace->dobj.name, - NULL, - tbinfo->rolname, - false, "SEQUENCE", SECTION_PRE_DATA, - query->data, delqry->data, NULL, - NULL, 0, - NULL, NULL); + ArchiveEntry(fout, tbinfo->dobj.catId, tbinfo->dobj.dumpId, + tbinfo->dobj.name, + tbinfo->dobj.namespace->dobj.name, + NULL, + tbinfo->rolname, + false, "SEQUENCE", SECTION_PRE_DATA, + query->data, delqry->data, NULL, + NULL, 0, + NULL, NULL); - /* - * If the sequence is owned by a table column, emit the ALTER for it - * as a separate TOC entry immediately following the sequence's own - * entry. It's OK to do this rather than using full sorting logic, - * because the dependency that tells us it's owned will have forced - * the table to be created first. We can't just include the ALTER in - * the TOC entry because it will fail if we haven't reassigned the - * sequence owner to match the table's owner. - * - * We need not schema-qualify the table reference because both - * sequence and table must be in the same schema. - */ - if (OidIsValid(tbinfo->owning_tab)) - { - TableInfo *owning_tab = findTableByOid(tbinfo->owning_tab); + /* + * If the sequence is owned by a table column, emit the ALTER for it as a + * separate TOC entry immediately following the sequence's own entry. + * It's OK to do this rather than using full sorting logic, because the + * dependency that tells us it's owned will have forced the table to be + * created first. We can't just include the ALTER in the TOC entry + * because it will fail if we haven't reassigned the sequence owner to + * match the table's owner. + * + * We need not schema-qualify the table reference because both sequence + * and table must be in the same schema. 
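+ * (With illustrative names, the command built below reads
+ * "ALTER SEQUENCE foo_id_seq OWNED BY foo.id;", using the owning table's
+ * column name.)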
+ */ + if (OidIsValid(tbinfo->owning_tab)) + { + TableInfo *owning_tab = findTableByOid(tbinfo->owning_tab); - if (owning_tab && owning_tab->dobj.dump) - { - resetPQExpBuffer(query); - appendPQExpBuffer(query, "ALTER SEQUENCE %s", - fmtId(tbinfo->dobj.name)); - appendPQExpBuffer(query, " OWNED BY %s", - fmtId(owning_tab->dobj.name)); - appendPQExpBuffer(query, ".%s;\n", + if (owning_tab && owning_tab->dobj.dump) + { + resetPQExpBuffer(query); + appendPQExpBuffer(query, "ALTER SEQUENCE %s", + fmtId(tbinfo->dobj.name)); + appendPQExpBuffer(query, " OWNED BY %s", + fmtId(owning_tab->dobj.name)); + appendPQExpBuffer(query, ".%s;\n", fmtId(owning_tab->attnames[tbinfo->owning_col - 1])); - ArchiveEntry(fout, nilCatalogId, createDumpId(), - tbinfo->dobj.name, - tbinfo->dobj.namespace->dobj.name, - NULL, - tbinfo->rolname, - false, "SEQUENCE OWNED BY", SECTION_PRE_DATA, - query->data, "", NULL, - &(tbinfo->dobj.dumpId), 1, - NULL, NULL); - } + ArchiveEntry(fout, nilCatalogId, createDumpId(), + tbinfo->dobj.name, + tbinfo->dobj.namespace->dobj.name, + NULL, + tbinfo->rolname, + false, "SEQUENCE OWNED BY", SECTION_PRE_DATA, + query->data, "", NULL, + &(tbinfo->dobj.dumpId), 1, + NULL, NULL); } - - /* Dump Sequence Comments and Security Labels */ - dumpComment(fout, labelq->data, - tbinfo->dobj.namespace->dobj.name, tbinfo->rolname, - tbinfo->dobj.catId, 0, tbinfo->dobj.dumpId); - dumpSecLabel(fout, labelq->data, - tbinfo->dobj.namespace->dobj.name, tbinfo->rolname, - tbinfo->dobj.catId, 0, tbinfo->dobj.dumpId); } - if (!schemaOnly) - { - resetPQExpBuffer(query); - appendPQExpBuffer(query, "SELECT pg_catalog.setval("); - appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout); - appendPQExpBuffer(query, ", %s, %s);\n", - last, (called ? "true" : "false")); - - ArchiveEntry(fout, nilCatalogId, createDumpId(), - tbinfo->dobj.name, - tbinfo->dobj.namespace->dobj.name, - NULL, - tbinfo->rolname, - false, "SEQUENCE SET", SECTION_PRE_DATA, - query->data, "", NULL, - &(tbinfo->dobj.dumpId), 1, - NULL, NULL); - } + /* Dump Sequence Comments and Security Labels */ + dumpComment(fout, labelq->data, + tbinfo->dobj.namespace->dobj.name, tbinfo->rolname, + tbinfo->dobj.catId, 0, tbinfo->dobj.dumpId); + dumpSecLabel(fout, labelq->data, + tbinfo->dobj.namespace->dobj.name, tbinfo->rolname, + tbinfo->dobj.catId, 0, tbinfo->dobj.dumpId); PQclear(res); @@ -13673,6 +13674,61 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) destroyPQExpBuffer(labelq); } +/* + * dumpSequenceData + * write the data of one user-defined sequence + */ +static void +dumpSequenceData(Archive *fout, TableDataInfo *tdinfo) +{ + TableInfo *tbinfo = tdinfo->tdtable; + PGresult *res; + char *last; + bool called; + PQExpBuffer query = createPQExpBuffer(); + + /* Make sure we are in proper schema */ + selectSourceSchema(fout, tbinfo->dobj.namespace->dobj.name); + + appendPQExpBuffer(query, + "SELECT last_value, is_called FROM %s", + fmtId(tbinfo->dobj.name)); + + res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); + + if (PQntuples(res) != 1) + { + write_msg(NULL, ngettext("query to get data of sequence \"%s\" returned %d row (expected 1)\n", + "query to get data of sequence \"%s\" returned %d rows (expected 1)\n", + PQntuples(res)), + tbinfo->dobj.name, PQntuples(res)); + exit_nicely(1); + } + + last = PQgetvalue(res, 0, 0); + called = (strcmp(PQgetvalue(res, 0, 1), "t") == 0); + + resetPQExpBuffer(query); + appendPQExpBuffer(query, "SELECT pg_catalog.setval("); + appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout); + 
appendPQExpBuffer(query, ", %s, %s);\n", + last, (called ? "true" : "false")); + + ArchiveEntry(fout, nilCatalogId, createDumpId(), + tbinfo->dobj.name, + tbinfo->dobj.namespace->dobj.name, + NULL, + tbinfo->rolname, + false, "SEQUENCE SET", SECTION_DATA, + query->data, "", NULL, + &(tbinfo->dobj.dumpId), 1, + NULL, NULL); + + PQclear(res); + + destroyPQExpBuffer(query); +} + static void dumpTrigger(Archive *fout, TriggerInfo *tginfo) { diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 40b24d2c54..76caf640c7 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -193,6 +193,7 @@ typedef struct _funcInfo Oid *argtypes; Oid prorettype; char *proacl; + char *proiargs; } FuncInfo; /* AggInfo is a superset of FuncInfo */ diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 9b9d84bfa2..436739733d 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -285,6 +285,9 @@ DOTypeNameCompare(const void *p1, const void *p2) cmpval = fobj1->nargs - fobj2->nargs; if (cmpval != 0) return cmpval; + cmpval = strcmp(fobj1->proiargs, fobj2->proiargs); + if (cmpval != 0) + return cmpval; } else if (obj1->objType == DO_OPERATOR) { diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index ca95bad1cc..088106fae0 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -82,24 +82,6 @@ static char *filename = NULL; int main(int argc, char *argv[]) { - char *pghost = NULL; - char *pgport = NULL; - char *pguser = NULL; - char *pgdb = NULL; - char *use_role = NULL; - enum trivalue prompt_password = TRI_DEFAULT; - bool data_only = false; - bool globals_only = false; - bool output_clean = false; - bool roles_only = false; - bool tablespaces_only = false; - PGconn *conn; - int encoding; - const char *std_strings; - int c, - ret; - int optindex; - static struct option long_options[] = { {"data-only", no_argument, NULL, 'a'}, {"clean", no_argument, NULL, 'c'}, @@ -142,6 +124,24 @@ main(int argc, char *argv[]) {NULL, 0, NULL, 0} }; + char *pghost = NULL; + char *pgport = NULL; + char *pguser = NULL; + char *pgdb = NULL; + char *use_role = NULL; + enum trivalue prompt_password = TRI_DEFAULT; + bool data_only = false; + bool globals_only = false; + bool output_clean = false; + bool roles_only = false; + bool tablespaces_only = false; + PGconn *conn; + int encoding; + const char *std_strings; + int c, + ret; + int optindex; + set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_dump")); progname = get_progname(argv[0]); @@ -502,7 +502,7 @@ main(int argc, char *argv[]) } /* Dump CREATE DATABASE commands */ - if (!globals_only && !roles_only && !tablespaces_only) + if (binary_upgrade || (!globals_only && !roles_only && !tablespaces_only)) dumpCreateDB(conn); /* Dump role/database settings */ @@ -642,7 +642,8 @@ dumpRoles(PGconn *conn) i_rolpassword, i_rolvaliduntil, i_rolreplication, - i_rolcomment; + i_rolcomment, + i_is_current_user; int i; /* note: rolconfig is dumped later */ @@ -652,7 +653,8 @@ dumpRoles(PGconn *conn) "rolcreaterole, rolcreatedb, " "rolcanlogin, rolconnlimit, rolpassword, " "rolvaliduntil, rolreplication, " - "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment " + "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, " + "rolname = current_user AS is_current_user " "FROM pg_authid " "ORDER BY 2"); else if (server_version >= 80200) @@ -661,7 +663,8 @@ dumpRoles(PGconn *conn) "rolcreaterole, rolcreatedb, " "rolcanlogin, rolconnlimit, rolpassword, " 
"rolvaliduntil, false as rolreplication, " - "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment " + "pg_catalog.shobj_description(oid, 'pg_authid') as rolcomment, " + "rolname = current_user AS is_current_user " "FROM pg_authid " "ORDER BY 2"); else if (server_version >= 80100) @@ -670,7 +673,8 @@ dumpRoles(PGconn *conn) "rolcreaterole, rolcreatedb, " "rolcanlogin, rolconnlimit, rolpassword, " "rolvaliduntil, false as rolreplication, " - "null as rolcomment " + "null as rolcomment, " + "rolname = current_user AS is_current_user " "FROM pg_authid " "ORDER BY 2"); else @@ -685,7 +689,8 @@ dumpRoles(PGconn *conn) "passwd as rolpassword, " "valuntil as rolvaliduntil, " "false as rolreplication, " - "null as rolcomment " + "null as rolcomment, " + "rolname = current_user AS is_current_user " "FROM pg_shadow " "UNION ALL " "SELECT 0, groname as rolname, " @@ -698,7 +703,7 @@ dumpRoles(PGconn *conn) "null::text as rolpassword, " "null::abstime as rolvaliduntil, " "false as rolreplication, " - "null as rolcomment " + "null as rolcomment, false " "FROM pg_group " "WHERE NOT EXISTS (SELECT 1 FROM pg_shadow " " WHERE usename = groname) " @@ -718,6 +723,7 @@ dumpRoles(PGconn *conn) i_rolvaliduntil = PQfnumber(res, "rolvaliduntil"); i_rolreplication = PQfnumber(res, "rolreplication"); i_rolcomment = PQfnumber(res, "rolcomment"); + i_is_current_user = PQfnumber(res, "is_current_user"); if (PQntuples(res) > 0) fprintf(OPF, "--\n-- Roles\n--\n\n"); @@ -745,9 +751,12 @@ dumpRoles(PGconn *conn) * will acquire the right properties even if it already exists (ie, it * won't hurt for the CREATE to fail). This is particularly important * for the role we are connected as, since even with --clean we will - * have failed to drop it. + * have failed to drop it. binary_upgrade cannot generate any errors, + * so we assume the current role is already created. */ - appendPQExpBuffer(buf, "CREATE ROLE %s;\n", fmtId(rolename)); + if (!binary_upgrade || + strcmp(PQgetvalue(res, i, i_is_current_user), "f") == 0) + appendPQExpBuffer(buf, "CREATE ROLE %s;\n", fmtId(rolename)); appendPQExpBuffer(buf, "ALTER ROLE %s WITH", fmtId(rolename)); if (strcmp(PQgetvalue(res, i, i_rolsuper), "t") == 0) diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index d5d89ec3ad..35be03d9db 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -91,7 +91,6 @@ main(int argc, char *argv[]) char *endptr; char *DataDir; int fd; - char path[MAXPGPATH]; set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_resetxlog")); @@ -252,13 +251,12 @@ main(int argc, char *argv[]) * Check for a postmaster lock file --- if there is one, refuse to * proceed, on grounds we might be interfering with a live installation. */ - snprintf(path, MAXPGPATH, "%s/postmaster.pid", DataDir); - - if ((fd = open(path, O_RDONLY, 0)) < 0) + if ((fd = open("postmaster.pid", O_RDONLY, 0)) < 0) { if (errno != ENOENT) { - fprintf(stderr, _("%s: could not open file \"%s\" for reading: %s\n"), progname, path, strerror(errno)); + fprintf(stderr, _("%s: could not open file \"%s\" for reading: %s\n"), + progname, "postmaster.pid", strerror(errno)); exit(1); } } @@ -266,7 +264,7 @@ main(int argc, char *argv[]) { fprintf(stderr, _("%s: lock file \"%s\" exists\n" "Is a server running? 
If not, delete the lock file and try again.\n"), - progname, path); + progname, "postmaster.pid"); exit(1); } @@ -612,6 +610,7 @@ RewriteControlFile(void) ControlFile.checkPoint = ControlFile.checkPointCopy.redo; ControlFile.prevCheckPoint = 0; ControlFile.minRecoveryPoint = 0; + ControlFile.minRecoveryPointTLI = 0; ControlFile.backupStartPoint = 0; ControlFile.backupEndPoint = 0; ControlFile.backupEndRequired = false; diff --git a/src/bin/psql/psqlscan.l b/src/bin/psql/psqlscan.l index d32a12c63c..6c1429815f 100644 --- a/src/bin/psql/psqlscan.l +++ b/src/bin/psql/psqlscan.l @@ -1807,7 +1807,7 @@ prepare_buffer(const char *txt, int len, char **txtcopy) /* first byte should always be okay... */ newtxt[i] = txt[i]; i++; - while (--thislen > 0) + while (--thislen > 0 && i < len) newtxt[i++] = (char) 0xFF; } } diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 52877ae293..1e8eabdb5e 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -167,7 +167,7 @@ typedef GISTScanOpaqueData *GISTScanOpaque; #define XLOG_GIST_PAGE_SPLIT 0x30 /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ #define XLOG_GIST_CREATE_INDEX 0x50 -#define XLOG_GIST_PAGE_DELETE 0x60 + /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ typedef struct gistxlogPageUpdate { @@ -211,12 +211,6 @@ typedef struct gistxlogPage int num; /* number of index tuples following */ } gistxlogPage; -typedef struct gistxlogPageDelete -{ - RelFileNode node; - BlockNumber blkno; -} gistxlogPageDelete; - /* SplitedPageLayout - gistSplit function result */ typedef struct SplitedPageLayout { diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index a4ab0627a0..7259f997e6 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -26,6 +26,7 @@ /* "options" flag bits for heap_insert */ #define HEAP_INSERT_SKIP_WAL 0x0001 #define HEAP_INSERT_SKIP_FSM 0x0002 +#define HEAP_INSERT_FROZEN 0x0004 typedef struct BulkInsertStateData *BulkInsertState; @@ -35,6 +36,29 @@ typedef enum LockTupleExclusive } LockTupleMode; +/* + * When heap_update, heap_delete, or heap_lock_tuple fail because the target + * tuple is already outdated, they fill in this struct to provide information + * to the caller about what happened. + * ctid is the target's ctid link: it is the same as the target's TID if the + * target was deleted, or the location of the replacement tuple if the target + * was updated. + * xmax is the outdating transaction's XID. If the caller wants to visit the + * replacement tuple, it must check that this matches before believing the + * replacement is really a match. + * cmax is the outdating command's CID, but only when the failure code is + * HeapTupleSelfUpdated (i.e., something in the current transaction outdated + * the tuple); otherwise cmax is zero. (We make this restriction because + * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other + * transactions.) 
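+ * A typical caller reaction, sketched: on HeapTupleUpdated, re-fetch the
+ * tuple at hufd.ctid, verify that its xmin matches hufd.xmax before
+ * treating it as the successor version, and then retry the operation.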
+ */ +typedef struct HeapUpdateFailureData +{ + ItemPointerData ctid; + TransactionId xmax; + CommandId cmax; +} HeapUpdateFailureData; + /* ---------------- * function prototypes for heap access method @@ -100,16 +124,15 @@ extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid, extern void heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, CommandId cid, int options, BulkInsertState bistate); extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, - ItemPointer ctid, TransactionId *update_xmax, - CommandId cid, Snapshot crosscheck, bool wait); + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd); extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - ItemPointer ctid, TransactionId *update_xmax, - CommandId cid, Snapshot crosscheck, bool wait); + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - Buffer *buffer, ItemPointer ctid, - TransactionId *update_xmax, CommandId cid, - LockTupleMode mode, bool nowait); + CommandId cid, LockTupleMode mode, bool nowait, + Buffer *buffer, HeapUpdateFailureData *hufd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid); extern bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, diff --git a/src/include/access/timeline.h b/src/include/access/timeline.h index f2a7658bc4..785195bd36 100644 --- a/src/include/access/timeline.h +++ b/src/include/access/timeline.h @@ -14,10 +14,28 @@ #include "access/xlogdefs.h" #include "nodes/pg_list.h" +/* + * A list of these structs describes the timeline history of the server. Each + * TimeLineHistoryEntry represents a piece of WAL belonging to the history, + * from newest to oldest. All WAL positions between 'begin' and 'end' belong to + * the timeline represented by the entry. Together the 'begin' and 'end' + * pointers of all the entries form a contiguous line from beginning of time + * to infinity. 
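+ *
+ * For example (illustrative values): a server now on timeline 2 that
+ * branched off timeline 1 at 0/5000000 would be described, newest first, as
+ *    { tli = 2, begin = 0/5000000, end = 0 (infinity) }
+ *    { tli = 1, begin = 0,         end = 0/5000000 }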
+ */ +typedef struct +{ + TimeLineID tli; + XLogRecPtr begin; /* inclusive */ + XLogRecPtr end; /* exclusive, 0 means infinity */ +} TimeLineHistoryEntry; + extern List *readTimeLineHistory(TimeLineID targetTLI); extern bool existsTimeLineHistory(TimeLineID probeTLI); extern TimeLineID findNewestTimeLine(TimeLineID startTLI); extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, - TimeLineID endTLI, XLogSegNo endLogSegNo, char *reason); + XLogRecPtr switchpoint, char *reason); +extern bool tliInHistory(TimeLineID tli, List *expectedTLIs); +extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history); +extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history); #endif /* TIMELINE_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2893f3b352..32c2e40ac1 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -71,11 +71,7 @@ typedef struct XLogRecord */ #define XLR_BKP_BLOCK_MASK 0x0F /* all info bits used for bkp blocks */ #define XLR_MAX_BKP_BLOCKS 4 -#define XLR_SET_BKP_BLOCK(iblk) (0x08 >> (iblk)) -#define XLR_BKP_BLOCK_1 XLR_SET_BKP_BLOCK(0) /* 0x08 */ -#define XLR_BKP_BLOCK_2 XLR_SET_BKP_BLOCK(1) /* 0x04 */ -#define XLR_BKP_BLOCK_3 XLR_SET_BKP_BLOCK(2) /* 0x02 */ -#define XLR_BKP_BLOCK_4 XLR_SET_BKP_BLOCK(3) /* 0x01 */ +#define XLR_BKP_BLOCK(iblk) (0x08 >> (iblk)) /* iblk in 0..3 */ /* Sync methods */ #define SYNC_METHOD_FSYNC 0 @@ -94,13 +90,13 @@ extern int sync_method; * If buffer is valid then XLOG will check if buffer must be backed up * (ie, whether this is first change of that page since last checkpoint). * If so, the whole page contents are attached to the XLOG record, and XLOG - * sets XLR_BKP_BLOCK_X bit in xl_info. Note that the buffer must be pinned + * sets XLR_BKP_BLOCK(N) bit in xl_info. Note that the buffer must be pinned * and exclusive-locked by the caller, so that it won't change under us. * NB: when the buffer is backed up, we DO NOT insert the data pointed to by * this XLogRecData struct into the XLOG record, since we assume it's present * in the buffer. Therefore, rmgr redo routines MUST pay attention to - * XLR_BKP_BLOCK_X to know what is actually stored in the XLOG record. - * The i'th XLR_BKP_BLOCK bit corresponds to the i'th distinct buffer + * XLR_BKP_BLOCK(N) to know what is actually stored in the XLOG record. + * The N'th XLR_BKP_BLOCK bit corresponds to the N'th distinct buffer * value (ignoring InvalidBuffer) appearing in the rdata chain. 
* * When buffer is valid, caller must set buffer_std to indicate whether the @@ -274,7 +270,9 @@ extern int XLogFileOpen(XLogSegNo segno); extern void XLogGetLastRemoved(XLogSegNo *segno); extern void XLogSetAsyncXactLSN(XLogRecPtr record); -extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup); +extern Buffer RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, + int block_index, + bool get_cleanup_lock, bool keep_buffer); extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 2c66b2feb8..89252d0230 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -205,6 +205,25 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader; (uint32) ((logSegNo) / XLogSegmentsPerXLogId), \ (uint32) ((logSegNo) % XLogSegmentsPerXLogId), offset) +/* + * Information logged when we detect a change in one of the parameters + * important for Hot Standby. + */ +typedef struct xl_parameter_change +{ + int MaxConnections; + int max_prepared_xacts; + int max_locks_per_xact; + int wal_level; +} xl_parameter_change; + +/* logs restore point */ +typedef struct xl_restore_point +{ + TimestampTz rp_time; + char rp_name[MAXFNAMELEN]; +} xl_restore_point; + /* * Method table for resource managers. @@ -243,7 +262,8 @@ extern char *recoveryRestoreCommand; * Prototypes for functions in xlogarchive.c */ extern bool RestoreArchivedFile(char *path, const char *xlogfname, - const char *recovername, off_t expectedSize); + const char *recovername, off_t expectedSize, + bool cleanupEnabled); extern void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnerror); extern void XLogArchiveNotify(const char *xlog); diff --git a/src/include/c.h b/src/include/c.h index 3b0fa9c4f0..a6c0e6e650 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -736,15 +736,15 @@ typedef NameData *Name; StaticAssertStmt(__builtin_types_compatible_p(__typeof__(varname), typename), \ CppAsString(varname) " does not have type " CppAsString(typename)) #define AssertVariableIsOfTypeMacro(varname, typename) \ - StaticAssertExpr(__builtin_types_compatible_p(__typeof__(varname), typename), \ - CppAsString(varname) " does not have type " CppAsString(typename)) + ((void) StaticAssertExpr(__builtin_types_compatible_p(__typeof__(varname), typename), \ + CppAsString(varname) " does not have type " CppAsString(typename))) #else /* !HAVE__BUILTIN_TYPES_COMPATIBLE_P */ #define AssertVariableIsOfType(varname, typename) \ StaticAssertStmt(sizeof(varname) == sizeof(typename), \ CppAsString(varname) " does not have type " CppAsString(typename)) #define AssertVariableIsOfTypeMacro(varname, typename) \ - StaticAssertExpr(sizeof(varname) == sizeof(typename), \ - CppAsString(varname) " does not have type " CppAsString(typename)) + ((void) StaticAssertExpr(sizeof(varname) == sizeof(typename), \ + CppAsString(varname) " does not have type " CppAsString(typename))) #endif /* HAVE__BUILTIN_TYPES_COMPATIBLE_P */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 5254a57d8a..9622356a63 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201210071 +#define CATALOG_VERSION_NO 201211281 #endif diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h index bc8c63a15e..a35829bb7b 100644 --- 
a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -66,7 +66,8 @@ extern Oid heap_create_with_catalog(const char *relname,
 					 OnCommitAction oncommit,
 					 Datum reloptions,
 					 bool use_user_acl,
-					 bool allow_system_table_mods);
+					 bool allow_system_table_mods,
+					 bool is_internal);
 
 extern void heap_create_init_fork(Relation rel);
 
@@ -106,6 +107,7 @@ extern Node *cookDefault(ParseState *pstate,
 
 extern void DeleteRelationTuple(Oid relid);
 extern void DeleteAttributeTuples(Oid relid);
+extern void DeleteSystemAttributeTuples(Oid relid);
 extern void RemoveAttributeById(Oid relid, AttrNumber attnum);
 extern void RemoveAttrDefault(Oid relid, AttrNumber attnum,
 				  DropBehavior behavior, bool complain, bool internal);
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index eb417cecb7..b96099f94c 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -27,6 +27,15 @@ typedef void (*IndexBuildCallback) (Relation index,
 									bool tupleIsAlive,
 									void *state);
 
+/* Action code for index_set_state_flags */
+typedef enum
+{
+	INDEX_CREATE_SET_READY,
+	INDEX_CREATE_SET_VALID,
+	INDEX_DROP_CLEAR_VALID,
+	INDEX_DROP_SET_DEAD
+} IndexStateFlagsAction;
+
 extern void index_check_primary_key(Relation heapRel,
 						IndexInfo *indexInfo,
@@ -50,7 +59,8 @@ extern Oid index_create(Relation heapRelation,
 			 bool initdeferred,
 			 bool allow_system_table_mods,
 			 bool skip_build,
-			 bool concurrent);
+			 bool concurrent,
+			 bool is_internal);
 
 extern void index_constraint_create(Relation heapRelation,
 						Oid indexRelationId,
@@ -89,6 +99,8 @@ extern double IndexBuildHeapScan(Relation heapRelation,
 
 extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
 
+extern void index_set_state_flags(Oid indexId, IndexStateFlagsAction action);
+
 extern void reindex_index(Oid indexId, bool skip_constraint_checks);
 
 /* Flag bits for reindex_relation(): */
diff --git a/src/include/catalog/objectaccess.h b/src/include/catalog/objectaccess.h
index 3b40dbc492..b4b84a64d0 100644
--- a/src/include/catalog/objectaccess.h
+++ b/src/include/catalog/objectaccess.h
@@ -30,6 +30,19 @@ typedef enum ObjectAccessType
 	OAT_DROP,
 } ObjectAccessType;
 
+/*
+ * Arguments of OAT_POST_CREATE event
+ */
+typedef struct
+{
+	/*
+	 * This flag tells extensions whether this object creation was invoked
+	 * by a user operation or only internally.  For example, creation of a
+	 * toast table, or of an index rebuilt because of a column type change,
+	 * is treated as internal.
+ */ + bool is_internal; +} ObjectAccessPostCreate; + /* * Arguments of OAT_DROP event */ diff --git a/src/include/catalog/pg_constraint.h b/src/include/catalog/pg_constraint.h index 9a1c890684..e4e9c40ca7 100644 --- a/src/include/catalog/pg_constraint.h +++ b/src/include/catalog/pg_constraint.h @@ -20,6 +20,7 @@ #define PG_CONSTRAINT_H #include "catalog/genbki.h" +#include "catalog/dependency.h" #include "nodes/pg_list.h" /* ---------------- @@ -244,7 +245,7 @@ extern char *ChooseConstraintName(const char *name1, const char *name2, List *others); extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, - Oid newNspId, bool isType); + Oid newNspId, bool isType, ObjectAddresses *objsMoved); extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index c9ee80531c..1408be9c3a 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -21,7 +21,7 @@ /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 931 +#define PG_CONTROL_VERSION 932 /* * Body of CheckPoint XLOG records. This is declared here because we keep @@ -153,6 +153,7 @@ typedef struct ControlFileData * pg_start_backup() call, not accompanied by pg_stop_backup(). */ XLogRecPtr minRecoveryPoint; + TimeLineID minRecoveryPointTLI; XLogRecPtr backupStartPoint; XLogRecPtr backupEndPoint; bool backupEndRequired; diff --git a/src/include/catalog/pg_index.h b/src/include/catalog/pg_index.h index 934fe97e5f..0ebe2cc9e2 100644 --- a/src/include/catalog/pg_index.h +++ b/src/include/catalog/pg_index.h @@ -41,6 +41,7 @@ CATALOG(pg_index,2610) BKI_WITHOUT_OIDS BKI_SCHEMA_MACRO bool indisvalid; /* is this index valid for use by queries? */ bool indcheckxmin; /* must we wait for xmin to be old? */ bool indisready; /* is this index ready for inserts? */ + bool indislive; /* is this index alive at all? */ /* variable-length fields start here, but we allow direct access to indkey */ int2vector indkey; /* column numbers of indexed cols, or 0 */ @@ -68,7 +69,7 @@ typedef FormData_pg_index *Form_pg_index; * compiler constants for pg_index * ---------------- */ -#define Natts_pg_index 17 +#define Natts_pg_index 18 #define Anum_pg_index_indexrelid 1 #define Anum_pg_index_indrelid 2 #define Anum_pg_index_indnatts 3 @@ -80,12 +81,13 @@ typedef FormData_pg_index *Form_pg_index; #define Anum_pg_index_indisvalid 9 #define Anum_pg_index_indcheckxmin 10 #define Anum_pg_index_indisready 11 -#define Anum_pg_index_indkey 12 -#define Anum_pg_index_indcollation 13 -#define Anum_pg_index_indclass 14 -#define Anum_pg_index_indoption 15 -#define Anum_pg_index_indexprs 16 -#define Anum_pg_index_indpred 17 +#define Anum_pg_index_indislive 12 +#define Anum_pg_index_indkey 13 +#define Anum_pg_index_indcollation 14 +#define Anum_pg_index_indclass 15 +#define Anum_pg_index_indoption 16 +#define Anum_pg_index_indexprs 17 +#define Anum_pg_index_indpred 18 /* * Index AMs that support ordered scans must support these two indoption @@ -95,4 +97,13 @@ typedef FormData_pg_index *Form_pg_index; #define INDOPTION_DESC 0x0001 /* values are in reverse order */ #define INDOPTION_NULLS_FIRST 0x0002 /* NULLs are first instead of last */ +/* + * Use of these macros is recommended over direct examination of the state + * flag columns where possible; this allows source code compatibility with + * the hacky representation used in 9.2. 
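+ *
+ * For example (illustrative only), code that previously inspected indisvalid
+ * and indisready directly would now be written along the lines of
+ *
+ *		if (IndexIsValid(idxForm) && IndexIsReady(idxForm))
+ *			... the index can be used for both queries and inserts ...
+ *
+ * where idxForm is a Form_pg_index pointer.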
+ */ +#define IndexIsValid(indexForm) ((indexForm)->indisvalid) +#define IndexIsReady(indexForm) ((indexForm)->indisready) +#define IndexIsLive(indexForm) ((indexForm)->indislive) + #endif /* PG_INDEX_H */ diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index d5103a88f4..a4c3ba6b4f 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -14,7 +14,6 @@ #ifndef STORAGE_H #define STORAGE_H -#include "access/xlog.h" #include "storage/block.h" #include "storage/relfilenode.h" #include "utils/relcache.h" @@ -34,9 +33,4 @@ extern void AtSubCommit_smgr(void); extern void AtSubAbort_smgr(void); extern void PostPrepare_smgr(void); -extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum); - -extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record); -extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec); - #endif /* STORAGE_H */ diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h new file mode 100644 index 0000000000..ae3a928231 --- /dev/null +++ b/src/include/catalog/storage_xlog.h @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * storage_xlog.h + * prototypes for XLog support for backend/catalog/storage.c + * + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/catalog/storage_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef STORAGE_XLOG_H +#define STORAGE_XLOG_H + +#include "access/xlog.h" +#include "storage/block.h" +#include "storage/relfilenode.h" + +/* + * Declarations for smgr-related XLOG records + * + * Note: we log file creation and truncation here, but logging of deletion + * actions is handled by xact.c, because it is part of transaction commit. 
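+ *
+ * A creation record is built from the structs below roughly as follows
+ * (a sketch of the pattern only; the real code lives in
+ * src/backend/catalog/storage.c):
+ *
+ *		xl_smgr_create xlrec;
+ *		XLogRecData rdata;
+ *
+ *		xlrec.rnode = *rnode;
+ *		xlrec.forkNum = forkNum;
+ *		rdata.data = (char *) &xlrec;
+ *		rdata.len = sizeof(xlrec);
+ *		rdata.buffer = InvalidBuffer;
+ *		rdata.next = NULL;
+ *		XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata);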
+ */ + +/* XLOG gives us high 4 bits */ +#define XLOG_SMGR_CREATE 0x10 +#define XLOG_SMGR_TRUNCATE 0x20 + +typedef struct xl_smgr_create +{ + RelFileNode rnode; + ForkNumber forkNum; +} xl_smgr_create; + +typedef struct xl_smgr_truncate +{ + BlockNumber blkno; + RelFileNode rnode; +} xl_smgr_truncate; + +extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum); + +extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record); +extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec); + +#endif /* STORAGE_XLOG_H */ diff --git a/src/include/commands/alter.h b/src/include/commands/alter.h index 84135457d1..4493985ad3 100644 --- a/src/include/commands/alter.h +++ b/src/include/commands/alter.h @@ -14,13 +14,15 @@ #ifndef ALTER_H #define ALTER_H +#include "catalog/dependency.h" #include "nodes/parsenodes.h" #include "utils/relcache.h" extern void ExecRenameStmt(RenameStmt *stmt); extern void ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt); -extern Oid AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid); +extern Oid AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid, + ObjectAddresses *objsMoved); extern Oid AlterObjectNamespace_internal(Relation rel, Oid objid, Oid nspOid); extern void ExecAlterOwnerStmt(AlterOwnerStmt *stmt); diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index 15d4713cec..4f32062056 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -15,6 +15,7 @@ #define TABLECMDS_H #include "access/htup.h" +#include "catalog/dependency.h" #include "nodes/parsenodes.h" #include "storage/lock.h" #include "utils/relcache.h" @@ -36,9 +37,13 @@ extern void AlterTableInternal(Oid relid, List *cmds, bool recurse); extern void AlterTableNamespace(AlterObjectSchemaStmt *stmt); +extern void AlterTableNamespaceInternal(Relation rel, Oid oldNspOid, + Oid nspOid, ObjectAddresses *objsMoved); + extern void AlterRelationNamespaceInternal(Relation classRel, Oid relOid, Oid oldNspOid, Oid newNspOid, - bool hasDependEntry); + bool hasDependEntry, + ObjectAddresses *objsMoved); extern void CheckTableNotInUse(Relation rel, const char *stmt); diff --git a/src/include/commands/typecmds.h b/src/include/commands/typecmds.h index b72cfc4fd9..e87ca90089 100644 --- a/src/include/commands/typecmds.h +++ b/src/include/commands/typecmds.h @@ -15,6 +15,7 @@ #define TYPECMDS_H #include "access/htup.h" +#include "catalog/dependency.h" #include "nodes/parsenodes.h" @@ -25,7 +26,7 @@ extern void RemoveTypeById(Oid typeOid); extern void DefineDomain(CreateDomainStmt *stmt); extern void DefineEnum(CreateEnumStmt *stmt); extern void DefineRange(CreateRangeStmt *stmt); -extern void AlterEnum(AlterEnumStmt *stmt); +extern void AlterEnum(AlterEnumStmt *stmt, bool isTopLevel); extern Oid DefineCompositeType(RangeVar *typevar, List *coldeflist); extern Oid AssignTypeArrayOid(void); @@ -45,9 +46,10 @@ extern void AlterTypeOwner(List *names, Oid newOwnerId, ObjectType objecttype); extern void AlterTypeOwnerInternal(Oid typeOid, Oid newOwnerId, bool hasDependEntry); extern void AlterTypeNamespace(List *names, const char *newschema, ObjectType objecttype); -extern Oid AlterTypeNamespace_oid(Oid typeOid, Oid nspOid); -extern Oid AlterTypeNamespaceInternal(Oid typeOid, Oid nspOid, +extern Oid AlterTypeNamespace_oid(Oid typeOid, Oid nspOid, ObjectAddresses *objsMoved); +extern Oid AlterTypeNamespaceInternal(Oid typeOid, Oid nspOid, bool isImplicitArray, - bool errorOnTableType); + bool errorOnTableType, + ObjectAddresses 
*objsMoved); #endif /* TYPECMDS_H */ diff --git a/src/include/lib/binaryheap.h b/src/include/lib/binaryheap.h new file mode 100644 index 0000000000..449ceb57fc --- /dev/null +++ b/src/include/lib/binaryheap.h @@ -0,0 +1,53 @@ +/* + * binaryheap.h + * + * A simple binary heap implementation + * + * Portions Copyright (c) 2012, PostgreSQL Global Development Group + * + * src/include/lib/binaryheap.h + */ + +#ifndef BINARYHEAP_H +#define BINARYHEAP_H + +/* + * For a max-heap, the comparator must return <0 iff a < b, 0 iff a == b, + * and >0 iff a > b. For a min-heap, the conditions are reversed. + */ +typedef int (*binaryheap_comparator) (Datum a, Datum b, void *arg); + +/* + * binaryheap + * + * bh_size how many nodes are currently in "nodes" + * bh_space how many nodes can be stored in "nodes" + * bh_has_heap_property no unordered operations since last heap build + * bh_compare comparison function to define the heap property + * bh_arg user data for comparison function + * bh_nodes variable-length array of "space" nodes + */ +typedef struct binaryheap +{ + int bh_size; + int bh_space; + bool bh_has_heap_property; /* debugging cross-check */ + binaryheap_comparator bh_compare; + void *bh_arg; + Datum bh_nodes[FLEXIBLE_ARRAY_MEMBER]; +} binaryheap; + +extern binaryheap *binaryheap_allocate(int capacity, + binaryheap_comparator compare, + void *arg); +extern void binaryheap_free(binaryheap *heap); +extern void binaryheap_add_unordered(binaryheap *heap, Datum d); +extern void binaryheap_build(binaryheap *heap); +extern void binaryheap_add(binaryheap *heap, Datum d); +extern Datum binaryheap_first(binaryheap *heap); +extern Datum binaryheap_remove_first(binaryheap *heap); +extern void binaryheap_replace_first(binaryheap *heap, Datum d); + +#define binaryheap_empty(h) ((h)->bh_size == 0) + +#endif /* BINARYHEAP_H */ diff --git a/src/include/lib/dllist.h b/src/include/lib/dllist.h deleted file mode 100644 index 25ed64c7c4..0000000000 --- a/src/include/lib/dllist.h +++ /dev/null @@ -1,85 +0,0 @@ -/*------------------------------------------------------------------------- - * - * dllist.h - * simple doubly linked list primitives - * the elements of the list are void* so the lists can contain anything - * Dlelem can only be in one list at a time - * - * - * Here's a small example of how to use Dllists: - * - * Dllist *lst; - * Dlelem *elt; - * void *in_stuff; -- stuff to stick in the list - * void *out_stuff - * - * lst = DLNewList(); -- make a new dllist - * DLAddHead(lst, DLNewElem(in_stuff)); -- add a new element to the list - * with in_stuff as the value - * ... - * elt = DLGetHead(lst); -- retrieve the head element - * out_stuff = (void*)DLE_VAL(elt); -- get the stuff out - * DLRemove(elt); -- removes the element from its list - * DLFreeElem(elt); -- free the element since we don't - * use it anymore - * - * - * It is also possible to use Dllist objects that are embedded in larger - * structures instead of being separately malloc'd. To do this, use - * DLInitElem() to initialize a Dllist field within a larger object. - * Don't forget to DLRemove() each field from its list (if any) before - * freeing the larger object! 
- * - * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/include/lib/dllist.h - * - *------------------------------------------------------------------------- - */ - -#ifndef DLLIST_H -#define DLLIST_H - -struct Dllist; -struct Dlelem; - -typedef struct Dlelem -{ - struct Dlelem *dle_next; /* next element */ - struct Dlelem *dle_prev; /* previous element */ - void *dle_val; /* value of the element */ - struct Dllist *dle_list; /* what list this element is in */ -} Dlelem; - -typedef struct Dllist -{ - Dlelem *dll_head; - Dlelem *dll_tail; -} Dllist; - -extern Dllist *DLNewList(void); /* allocate and initialize a list header */ -extern void DLInitList(Dllist *list); /* init a header alloced by caller */ -extern void DLFreeList(Dllist *list); /* free up a list and all the nodes in - * it */ -extern Dlelem *DLNewElem(void *val); -extern void DLInitElem(Dlelem *e, void *val); -extern void DLFreeElem(Dlelem *e); -extern void DLRemove(Dlelem *e); /* removes node from list */ -extern void DLAddHead(Dllist *list, Dlelem *node); -extern void DLAddTail(Dllist *list, Dlelem *node); -extern Dlelem *DLRemHead(Dllist *list); /* remove and return the head */ -extern Dlelem *DLRemTail(Dllist *list); -extern void DLMoveToFront(Dlelem *e); /* move node to front of its list */ - -/* These are macros for speed */ -#define DLGetHead(list) ((list)->dll_head) -#define DLGetTail(list) ((list)->dll_tail) -#define DLGetSucc(elem) ((elem)->dle_next) -#define DLGetPred(elem) ((elem)->dle_prev) -#define DLGetListHdr(elem) ((elem)->dle_list) - -#define DLE_VAL(elem) ((elem)->dle_val) - -#endif /* DLLIST_H */ diff --git a/src/include/lib/ilist.h b/src/include/lib/ilist.h new file mode 100644 index 0000000000..8f584863a3 --- /dev/null +++ b/src/include/lib/ilist.h @@ -0,0 +1,735 @@ +/*------------------------------------------------------------------------- + * + * ilist.h + * integrated/inline doubly- and singly-linked lists + * + * These list types are useful when there are only a predetermined set of + * lists that an object could be in. List links are embedded directly into + * the objects, and thus no extra memory management overhead is required. + * (Of course, if only a small proportion of existing objects are in a list, + * the link fields in the remainder would be wasted space. But usually, + * it saves space to not have separately-allocated list nodes.) + * + * None of the functions here allocate any memory; they just manipulate + * externally managed memory. The APIs for singly and doubly linked lists + * are identical as far as capabilities of both allow. + * + * Each list has a list header, which exists even when the list is empty. + * An empty singly-linked list has a NULL pointer in its header. + * There are two kinds of empty doubly linked lists: those that have been + * initialized to NULL, and those that have been initialized to circularity. + * (If a dlist is modified and then all its elements are deleted, it will be + * in the circular state.) We prefer circular dlists because there are some + * operations that can be done without branches (and thus faster) on lists + * that use circular representation. However, it is often convenient to + * initialize list headers to zeroes rather than setting them up with an + * explicit initialization function, so we also allow the other case. + * + * EXAMPLES + * + * Here's a simple example demonstrating how this can be used. 
Let's assume + * we want to store information about the tables contained in a database. + * + * #include "lib/ilist.h" + * + * // Define struct for the databases including a list header that will be + * // used to access the nodes in the table list later on. + * typedef struct my_database + * { + * char *datname; + * dlist_head tables; + * // ... + * } my_database; + * + * // Define struct for the tables. Note the list_node element which stores + * // prev/next list links. The list_node element need not be first. + * typedef struct my_table + * { + * char *tablename; + * dlist_node list_node; + * perm_t permissions; + * // ... + * } my_table; + * + * // create a database + * my_database *db = create_database(); + * + * // and add a few tables to its table list + * dlist_push_head(&db->tables, &create_table(db, "a")->list_node); + * ... + * dlist_push_head(&db->tables, &create_table(db, "b")->list_node); + * + * + * To iterate over the table list, we allocate an iterator variable and use + * a specialized looping construct. Inside a dlist_foreach, the iterator's + * 'cur' field can be used to access the current element. iter.cur points to + * a 'dlist_node', but most of the time what we want is the actual table + * information; dlist_container() gives us that, like so: + * + * dlist_iter iter; + * dlist_foreach(iter, &db->tables) + * { + * my_table *tbl = dlist_container(my_table, list_node, iter.cur); + * printf("we have a table: %s in database %s\n", + * tbl->tablename, db->datname); + * } + * + * + * While a simple iteration is useful, we sometimes also want to manipulate + * the list while iterating. There is a different iterator element and looping + * construct for that. Suppose we want to delete tables that meet a certain + * criterion: + * + * dlist_mutable_iter miter; + * dlist_foreach_modify(miter, &db->tables) + * { + * my_table *tbl = dlist_container(my_table, list_node, miter.cur); + * + * if (!tbl->to_be_deleted) + * continue; // don't touch this one + * + * // unlink the current table from the linked list + * dlist_delete(miter.cur); + * // as these lists never manage memory, we can still access the table + * // after it's been unlinked + * drop_table(db, tbl); + * } + * + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/lib/ilist.h + *------------------------------------------------------------------------- + */ +#ifndef ILIST_H +#define ILIST_H + +/* + * Enable for extra debugging. This is rather expensive, so it's not enabled by + * default even when USE_ASSERT_CHECKING. + */ +/* #define ILIST_DEBUG */ + +/* + * Node of a doubly linked list. + * + * Embed this in structs that need to be part of a doubly linked list. + */ +typedef struct dlist_node dlist_node; +struct dlist_node +{ + dlist_node *prev; + dlist_node *next; +}; + +/* + * Head of a doubly linked list. + * + * Non-empty lists are internally circularly linked. Circular lists have the + * advantage of not needing any branches in the most common list manipulations. + * An empty list can also be represented as a pair of NULL pointers, making + * initialization easier. + */ +typedef struct dlist_head +{ + /* + * head.next either points to the first element of the list; to &head if + * it's a circular empty list; or to NULL if empty and not circular. 
+ * + * head.prev either points to the last element of the list; to &head if + * it's a circular empty list; or to NULL if empty and not circular. + */ + dlist_node head; +} dlist_head; + + +/* + * Doubly linked list iterator. + * + * Used as state in dlist_foreach() and dlist_reverse_foreach(). To get the + * current element of the iteration use the 'cur' member. + * + * Iterations using this are *not* allowed to change the list while iterating! + * + * NB: We use an extra "end" field here to avoid multiple evaluations of + * arguments in the dlist_foreach() macro. + */ +typedef struct dlist_iter +{ + dlist_node *cur; /* current element */ + dlist_node *end; /* last node we'll iterate to */ +} dlist_iter; + +/* + * Doubly linked list iterator allowing some modifications while iterating. + * + * Used as state in dlist_foreach_modify(). To get the current element of the + * iteration use the 'cur' member. + * + * Iterations using this are only allowed to change the list at the current + * point of iteration. It is fine to delete the current node, but it is *not* + * fine to insert or delete adjacent nodes. + * + * NB: We need a separate type for mutable iterations so that we can store + * the 'next' node of the current node in case it gets deleted or modified. + */ +typedef struct dlist_mutable_iter +{ + dlist_node *cur; /* current element */ + dlist_node *next; /* next node we'll iterate to */ + dlist_node *end; /* last node we'll iterate to */ +} dlist_mutable_iter; + +/* + * Node of a singly linked list. + * + * Embed this in structs that need to be part of a singly linked list. + */ +typedef struct slist_node slist_node; +struct slist_node +{ + slist_node *next; +}; + +/* + * Head of a singly linked list. + * + * Singly linked lists are not circularly linked, in contrast to doubly linked + * lists; we just set head.next to NULL if empty. This doesn't incur any + * additional branches in the usual manipulations. + */ +typedef struct slist_head +{ + slist_node head; +} slist_head; + +/* + * Singly linked list iterator. + * + * Used as state in slist_foreach(). To get the current element of the + * iteration use the 'cur' member. + * + * Do *not* manipulate the list while iterating! + * + * NB: this wouldn't really need to be an extra struct, we could use a + * slist_node * directly. We prefer a separate type for consistency. + */ +typedef struct slist_iter +{ + slist_node *cur; +} slist_iter; + +/* + * Singly linked list iterator allowing some modifications while iterating. + * + * Used as state in slist_foreach_modify(). + * + * Iterations using this are allowed to remove the current node and to add + * more nodes ahead of the current node. + */ +typedef struct slist_mutable_iter +{ + slist_node *cur; /* current element */ + slist_node *next; /* next node we'll iterate to */ +} slist_mutable_iter; + + +/* Static initializers */ +#define DLIST_STATIC_INIT(name) {{&(name).head, &(name).head}} +#define SLIST_STATIC_INIT(name) {{NULL}} + + +/* Prototypes for functions too big to be inline */ + +/* Caution: this is O(n) */ +extern void slist_delete(slist_head *head, slist_node *node); + +#ifdef ILIST_DEBUG +extern void dlist_check(dlist_head *head); +extern void slist_check(slist_head *head); +#else +/* + * These seemingly useless casts to void are here to keep the compiler quiet + * about the argument being unused in many functions in a non-debug compile, + * in which functions the only point of passing the list head pointer is to be + * able to run these checks. 
+ */ +#define dlist_check(head) ((void) (head)) +#define slist_check(head) ((void) (head)) +#endif /* ILIST_DEBUG */ + + +/* + * We want the functions below to be inline; but if the compiler doesn't + * support that, fall back on providing them as regular functions. See + * STATIC_IF_INLINE in c.h. + */ +#ifndef PG_USE_INLINE +extern void dlist_init(dlist_head *head); +extern bool dlist_is_empty(dlist_head *head); +extern void dlist_push_head(dlist_head *head, dlist_node *node); +extern void dlist_push_tail(dlist_head *head, dlist_node *node); +extern void dlist_insert_after(dlist_node *after, dlist_node *node); +extern void dlist_insert_before(dlist_node *before, dlist_node *node); +extern void dlist_delete(dlist_node *node); +extern dlist_node *dlist_pop_head_node(dlist_head *head); +extern void dlist_move_head(dlist_head *head, dlist_node *node); +extern bool dlist_has_next(dlist_head *head, dlist_node *node); +extern bool dlist_has_prev(dlist_head *head, dlist_node *node); +extern dlist_node *dlist_next_node(dlist_head *head, dlist_node *node); +extern dlist_node *dlist_prev_node(dlist_head *head, dlist_node *node); +extern dlist_node *dlist_head_node(dlist_head *head); +extern dlist_node *dlist_tail_node(dlist_head *head); + +/* dlist macro support functions */ +extern void *dlist_tail_element_off(dlist_head *head, size_t off); +extern void *dlist_head_element_off(dlist_head *head, size_t off); +#endif /* !PG_USE_INLINE */ + +#if defined(PG_USE_INLINE) || defined(ILIST_INCLUDE_DEFINITIONS) +/* + * Initialize a doubly linked list. + * Previous state will be thrown away without any cleanup. + */ +STATIC_IF_INLINE void +dlist_init(dlist_head *head) +{ + head->head.next = head->head.prev = &head->head; +} + +/* + * Is the list empty? + * + * An empty list has either its first 'next' pointer set to NULL, or to itself. + */ +STATIC_IF_INLINE bool +dlist_is_empty(dlist_head *head) +{ + dlist_check(head); + + return head->head.next == NULL || head->head.next == &(head->head); +} + +/* + * Insert a node at the beginning of the list. + */ +STATIC_IF_INLINE void +dlist_push_head(dlist_head *head, dlist_node *node) +{ + if (head->head.next == NULL) /* convert NULL header to circular */ + dlist_init(head); + + node->next = head->head.next; + node->prev = &head->head; + node->next->prev = node; + head->head.next = node; + + dlist_check(head); +} + +/* + * Insert a node at the end of the list. + */ +STATIC_IF_INLINE void +dlist_push_tail(dlist_head *head, dlist_node *node) +{ + if (head->head.next == NULL) /* convert NULL header to circular */ + dlist_init(head); + + node->next = &head->head; + node->prev = head->head.prev; + node->prev->next = node; + head->head.prev = node; + + dlist_check(head); +} + +/* + * Insert a node after another *in the same list* + */ +STATIC_IF_INLINE void +dlist_insert_after(dlist_node *after, dlist_node *node) +{ + node->prev = after; + node->next = after->next; + after->next = node; + node->next->prev = node; +} + +/* + * Insert a node before another *in the same list* + */ +STATIC_IF_INLINE void +dlist_insert_before(dlist_node *before, dlist_node *node) +{ + node->prev = before->prev; + node->next = before; + before->prev = node; + node->prev->next = node; +} + +/* + * Delete 'node' from its list (it must be in one). + */ +STATIC_IF_INLINE void +dlist_delete(dlist_node *node) +{ + node->prev->next = node->next; + node->next->prev = node->prev; +} + +/* + * Remove and return the first node from a list (there must be one). 
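+ *
+ * A typical drain loop looks like this (sketch):
+ *
+ *		while (!dlist_is_empty(&mylist))
+ *		{
+ *			dlist_node *node = dlist_pop_head_node(&mylist);
+ *
+ *			... process node, e.g. via dlist_container() ...
+ *		}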
+ */ +STATIC_IF_INLINE dlist_node * +dlist_pop_head_node(dlist_head *head) +{ + dlist_node *node; + + Assert(!dlist_is_empty(head)); + node = head->head.next; + dlist_delete(node); + return node; +} + +/* + * Move element from its current position in the list to the head position in + * the same list. + * + * Undefined behaviour if 'node' is not already part of the list. + */ +STATIC_IF_INLINE void +dlist_move_head(dlist_head *head, dlist_node *node) +{ + /* fast path if it's already at the head */ + if (head->head.next == node) + return; + + dlist_delete(node); + dlist_push_head(head, node); + + dlist_check(head); +} + +/* + * Check whether 'node' has a following node. + * Caution: unreliable if 'node' is not in the list. + */ +STATIC_IF_INLINE bool +dlist_has_next(dlist_head *head, dlist_node *node) +{ + return node->next != &head->head; +} + +/* + * Check whether 'node' has a preceding node. + * Caution: unreliable if 'node' is not in the list. + */ +STATIC_IF_INLINE bool +dlist_has_prev(dlist_head *head, dlist_node *node) +{ + return node->prev != &head->head; +} + +/* + * Return the next node in the list (there must be one). + */ +STATIC_IF_INLINE dlist_node * +dlist_next_node(dlist_head *head, dlist_node *node) +{ + Assert(dlist_has_next(head, node)); + return node->next; +} + +/* + * Return previous node in the list (there must be one). + */ +STATIC_IF_INLINE dlist_node * +dlist_prev_node(dlist_head *head, dlist_node *node) +{ + Assert(dlist_has_prev(head, node)); + return node->prev; +} + +/* internal support function to get address of head element's struct */ +STATIC_IF_INLINE void * +dlist_head_element_off(dlist_head *head, size_t off) +{ + Assert(!dlist_is_empty(head)); + return (char *) head->head.next - off; +} + +/* + * Return the first node in the list (there must be one). + */ +STATIC_IF_INLINE dlist_node * +dlist_head_node(dlist_head *head) +{ + return (dlist_node *) dlist_head_element_off(head, 0); +} + +/* internal support function to get address of tail element's struct */ +STATIC_IF_INLINE void * +dlist_tail_element_off(dlist_head *head, size_t off) +{ + Assert(!dlist_is_empty(head)); + return (char *) head->head.prev - off; +} + +/* + * Return the last node in the list (there must be one). + */ +STATIC_IF_INLINE dlist_node * +dlist_tail_node(dlist_head *head) +{ + return (dlist_node *) dlist_tail_element_off(head, 0); +} +#endif /* PG_USE_INLINE || ILIST_INCLUDE_DEFINITIONS */ + +/* + * Return the containing struct of 'type' where 'membername' is the dlist_node + * pointed at by 'ptr'. + * + * This is used to convert a dlist_node * back to its containing struct. + */ +#define dlist_container(type, membername, ptr) \ + (AssertVariableIsOfTypeMacro(ptr, dlist_node *), \ + AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + ((type *) ((char *) (ptr) - offsetof(type, membername)))) + +/* + * Return the address of the first element in the list. + * + * The list must not be empty. + */ +#define dlist_head_element(type, membername, lhead) \ + (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + (type *) dlist_head_element_off(lhead, offsetof(type, membername))) + +/* + * Return the address of the last element in the list. + * + * The list must not be empty. 
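+ *
+ * For example, with the structs from the comment at the top of this file
+ * (illustrative only):
+ *
+ *		my_table *last = dlist_tail_element(my_table, list_node, &db->tables);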
+ */ +#define dlist_tail_element(type, membername, lhead) \ + (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \ + ((type *) dlist_tail_element_off(lhead, offsetof(type, membername)))) + +/* + * Iterate through the list pointed at by 'lhead' storing the state in 'iter'. + * + * Access the current element with iter.cur. + * + * It is *not* allowed to manipulate the list during iteration. + */ +#define dlist_foreach(iter, lhead) \ + for (AssertVariableIsOfTypeMacro(iter, dlist_iter), \ + AssertVariableIsOfTypeMacro(lhead, dlist_head *), \ + (iter).end = &(lhead)->head, \ + (iter).cur = (iter).end->next ? (iter).end->next : (iter).end; \ + (iter).cur != (iter).end; \ + (iter).cur = (iter).cur->next) + +/* + * Iterate through the list pointed at by 'lhead' storing the state in 'iter'. + * + * Access the current element with iter.cur. + * + * Iterations using this are only allowed to change the list at the current + * point of iteration. It is fine to delete the current node, but it is *not* + * fine to insert or delete adjacent nodes. + */ +#define dlist_foreach_modify(iter, lhead) \ + for (AssertVariableIsOfTypeMacro(iter, dlist_mutable_iter), \ + AssertVariableIsOfTypeMacro(lhead, dlist_head *), \ + (iter).end = &(lhead)->head, \ + (iter).cur = (iter).end->next ? (iter).end->next : (iter).end, \ + (iter).next = (iter).cur->next; \ + (iter).cur != (iter).end; \ + (iter).cur = (iter).next, (iter).next = (iter).cur->next) + +/* + * Iterate through the list in reverse order. + * + * It is *not* allowed to manipulate the list during iteration. + */ +#define dlist_reverse_foreach(iter, lhead) \ + for (AssertVariableIsOfTypeMacro(iter, dlist_iter), \ + AssertVariableIsOfTypeMacro(lhead, dlist_head *), \ + (iter).end = &(lhead)->head, \ + (iter).cur = (iter).end->prev ? (iter).end->prev : (iter).end; \ + (iter).cur != (iter).end; \ + (iter).cur = (iter).cur->prev) + + +/* + * We want the functions below to be inline; but if the compiler doesn't + * support that, fall back on providing them as regular functions. See + * STATIC_IF_INLINE in c.h. + */ +#ifndef PG_USE_INLINE +extern void slist_init(slist_head *head); +extern bool slist_is_empty(slist_head *head); +extern void slist_push_head(slist_head *head, slist_node *node); +extern void slist_insert_after(slist_node *after, slist_node *node); +extern slist_node *slist_pop_head_node(slist_head *head); +extern bool slist_has_next(slist_head *head, slist_node *node); +extern slist_node *slist_next_node(slist_head *head, slist_node *node); +extern slist_node *slist_head_node(slist_head *head); + +/* slist macro support function */ +extern void *slist_head_element_off(slist_head *head, size_t off); +#endif + +#if defined(PG_USE_INLINE) || defined(ILIST_INCLUDE_DEFINITIONS) +/* + * Initialize a singly linked list. + * Previous state will be thrown away without any cleanup. + */ +STATIC_IF_INLINE void +slist_init(slist_head *head) +{ + head->head.next = NULL; +} + +/* + * Is the list empty? + */ +STATIC_IF_INLINE bool +slist_is_empty(slist_head *head) +{ + slist_check(head); + + return head->head.next == NULL; +} + +/* + * Insert a node at the beginning of the list. 
+ */ +STATIC_IF_INLINE void +slist_push_head(slist_head *head, slist_node *node) +{ + node->next = head->head.next; + head->head.next = node; + + slist_check(head); +} + +/* + * Insert a node after another *in the same list* + */ +STATIC_IF_INLINE void +slist_insert_after(slist_node *after, slist_node *node) +{ + node->next = after->next; + after->next = node; +} + +/* + * Remove and return the first node from a list (there must be one). + */ +STATIC_IF_INLINE slist_node * +slist_pop_head_node(slist_head *head) +{ + slist_node *node; + + Assert(!slist_is_empty(head)); + node = head->head.next; + head->head.next = node->next; + slist_check(head); + return node; +} + +/* + * Check whether 'node' has a following node. + */ +STATIC_IF_INLINE bool +slist_has_next(slist_head *head, slist_node *node) +{ + slist_check(head); + + return node->next != NULL; +} + +/* + * Return the next node in the list (there must be one). + */ +STATIC_IF_INLINE slist_node * +slist_next_node(slist_head *head, slist_node *node) +{ + Assert(slist_has_next(head, node)); + return node->next; +} + +/* internal support function to get address of head element's struct */ +STATIC_IF_INLINE void * +slist_head_element_off(slist_head *head, size_t off) +{ + Assert(!slist_is_empty(head)); + return (char *) head->head.next - off; +} + +/* + * Return the first node in the list (there must be one). + */ +STATIC_IF_INLINE slist_node * +slist_head_node(slist_head *head) +{ + return (slist_node *) slist_head_element_off(head, 0); +} +#endif /* PG_USE_INLINE || ILIST_INCLUDE_DEFINITIONS */ + +/* + * Return the containing struct of 'type' where 'membername' is the slist_node + * pointed at by 'ptr'. + * + * This is used to convert a slist_node * back to its containing struct. + */ +#define slist_container(type, membername, ptr) \ + (AssertVariableIsOfTypeMacro(ptr, slist_node *), \ + AssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \ + ((type *) ((char *) (ptr) - offsetof(type, membername)))) + +/* + * Return the address of the first element in the list. + * + * The list must not be empty. + */ +#define slist_head_element(type, membername, lhead) \ + (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \ + (type *) slist_head_element_off(lhead, offsetof(type, membername))) + +/* + * Iterate through the list pointed at by 'lhead' storing the state in 'iter'. + * + * Access the current element with iter.cur. + * + * It is *not* allowed to manipulate the list during iteration. + */ +#define slist_foreach(iter, lhead) \ + for (AssertVariableIsOfTypeMacro(iter, slist_iter), \ + AssertVariableIsOfTypeMacro(lhead, slist_head *), \ + (iter).cur = (lhead)->head.next; \ + (iter).cur != NULL; \ + (iter).cur = (iter).cur->next) + +/* + * Iterate through the list pointed at by 'lhead' storing the state in 'iter'. + * + * Access the current element with iter.cur. + * + * Iterations using this are allowed to remove the current node and to add + * more nodes ahead of the current node. + */ +#define slist_foreach_modify(iter, lhead) \ + for (AssertVariableIsOfTypeMacro(iter, slist_mutable_iter), \ + AssertVariableIsOfTypeMacro(lhead, slist_head *), \ + (iter).cur = (lhead)->head.next, \ + (iter).next = (iter).cur ? (iter).cur->next : NULL; \ + (iter).cur != NULL; \ + (iter).cur = (iter).next, \ + (iter).next = (iter).next ? 
(iter).next->next : NULL) + +#endif /* ILIST_H */ diff --git a/src/include/libpq/hba.h b/src/include/libpq/hba.h index 408d26263a..79a5dc608c 100644 --- a/src/include/libpq/hba.h +++ b/src/include/libpq/hba.h @@ -71,6 +71,7 @@ typedef struct HbaLine char *ldapbindpasswd; char *ldapsearchattribute; char *ldapbasedn; + int ldapscope; char *ldapprefix; char *ldapsuffix; bool clientcert; diff --git a/src/include/libpq/pqcomm.h b/src/include/libpq/pqcomm.h index 604b5535df..635132dec9 100644 --- a/src/include/libpq/pqcomm.h +++ b/src/include/libpq/pqcomm.h @@ -73,6 +73,19 @@ typedef struct DEFAULT_PGSOCKET_DIR, \ (port)) +/* + * The maximum workable length of a socket path is what will fit into + * struct sockaddr_un. This is usually only 100 or so bytes :-(. + * + * For consistency, always pass a MAXPGPATH-sized buffer to UNIXSOCK_PATH(), + * then complain if the resulting string is >= UNIXSOCK_PATH_BUFLEN bytes. + * (Because the standard API for getaddrinfo doesn't allow it to complain in + * a useful way when the socket pathname is too long, we have to test for + * this explicitly, instead of just letting the subroutine return an error.) + */ +#define UNIXSOCK_PATH_BUFLEN sizeof(((struct sockaddr_un *) NULL)->sun_path) + + /* * These manipulate the frontend/backend protocol version number. * diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9f57989f80..3ea349315c 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -416,7 +416,7 @@ extern char *local_preload_libraries_string; * 7 shared memory key (not present on Windows) * * Lines 6 and up are added via AddToDataDirLockFile() after initial file - * creation; they have to be ordered according to time of addition. + * creation. * * The socket lock file, if used, has the same contents as lines 1-5. */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index fec07b8e42..d4911bd2ae 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1100,10 +1100,8 @@ typedef struct AppendState * nkeys number of sort key columns * sortkeys sort keys in SortSupport representation * slots current output tuple of each subplan - * heap heap of active tuples (represented as array indexes) - * heap_size number of active heap entries + * heap heap of active tuples * initialized true if we have fetched first tuple from each subplan - * last_slot last subplan fetched from (which must be re-called) * ---------------- */ typedef struct MergeAppendState @@ -1114,10 +1112,8 @@ typedef struct MergeAppendState int ms_nkeys; SortSupport ms_sortkeys; /* array of length ms_nkeys */ TupleTableSlot **ms_slots; /* array of length ms_nplans */ - int *ms_heap; /* array of length ms_nplans */ - int ms_heap_size; /* current active length of ms_heap[] */ + struct binaryheap *ms_heap; /* binary heap of slot indices */ bool ms_initialized; /* are subplans started? 
*/ - int ms_last_slot; /* last subplan slot we returned from */ } MergeAppendState; /* ---------------- diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 09b15e7694..88344990f4 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1197,6 +1197,7 @@ typedef enum AlterTableType AT_ReAddIndex, /* internal to commands/tablecmds.c */ AT_AddConstraint, /* add constraint */ AT_AddConstraintRecurse, /* internal to commands/tablecmds.c */ + AT_ReAddConstraint, /* internal to commands/tablecmds.c */ AT_ValidateConstraint, /* validate constraint */ AT_ValidateConstraintRecurse, /* internal to commands/tablecmds.c */ AT_ProcessedConstraint, /* pre-processed add constraint (local in diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 2b2742d7ef..0a1f8d5289 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -616,6 +616,7 @@ typedef struct EquivalenceMember Expr *em_expr; /* the expression represented */ Relids em_relids; /* all relids appearing in em_expr */ + Relids em_nullable_relids; /* nullable by lower outer joins */ bool em_is_const; /* expression is pseudoconstant? */ bool em_is_child; /* derived version for a child relation? */ Oid em_datatype; /* the "nominal type" used by the opfamily */ diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 165856de0b..c7d59d28da 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -124,7 +124,8 @@ extern void add_child_rel_equivalences(PlannerInfo *root, RelOptInfo *child_rel); extern void mutate_eclass_expressions(PlannerInfo *root, Node *(*mutator) (), - void *context); + void *context, + bool include_child_exprs); extern List *generate_implied_equalities_for_indexcol(PlannerInfo *root, IndexOptInfo *index, int indexcol, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index c395d4292c..0fe696c2db 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -105,13 +105,15 @@ extern void process_implied_equality(PlannerInfo *root, Expr *item1, Expr *item2, Relids qualscope, + Relids nullable_relids, bool below_outer_join, bool both_const); extern RestrictInfo *build_implied_join_equality(Oid opno, Oid collation, Expr *item1, Expr *item2, - Relids qualscope); + Relids qualscope, + Relids nullable_relids); /* * prototypes for plan/analyzejoins.c diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index e3bb35f130..aa9c648e41 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -206,7 +206,7 @@ typedef struct ParseCallbackState { ParseState *pstate; int location; - ErrorContextCallback errcontext; + ErrorContextCallback errcallback; } ParseCallbackState; diff --git a/src/include/port/win32.h b/src/include/port/win32.h index b357663d63..7d05913312 100644 --- a/src/include/port/win32.h +++ b/src/include/port/win32.h @@ -65,7 +65,13 @@ #define USES_WINSOCK -/* defines for dynamic linking on Win32 platform */ +/* defines for dynamic linking on Win32 platform + * + * http://support.microsoft.com/kb/132044 + * http://msdn.microsoft.com/en-us/library/8fskxacy(v=vs.80).aspx + * http://msdn.microsoft.com/en-us/library/a90k134d(v=vs.80).aspx + */ + #if defined(WIN32) || defined(__CYGWIN__) #ifdef BUILDING_DLL diff --git a/src/include/replication/walprotocol.h b/src/include/replication/walprotocol.h deleted file mode 100644 index 396d006ea7..0000000000 --- 
a/src/include/replication/walprotocol.h +++ /dev/null @@ -1,128 +0,0 @@ -/*------------------------------------------------------------------------- - * - * walprotocol.h - * Definitions relevant to the streaming WAL transmission protocol. - * - * Portions Copyright (c) 2010-2012, PostgreSQL Global Development Group - * - * src/include/replication/walprotocol.h - * - *------------------------------------------------------------------------- - */ -#ifndef _WALPROTOCOL_H -#define _WALPROTOCOL_H - -#include "access/xlogdefs.h" -#include "datatype/timestamp.h" - - -/* - * All messages from WalSender must contain these fields to allow us to - * correctly calculate the replication delay. - */ -typedef struct -{ - /* Current end of WAL on the sender */ - XLogRecPtr walEnd; - - /* Sender's system clock at the time of transmission */ - TimestampTz sendTime; - - /* - * If replyRequested is set, the client should reply immediately to this - * message, to avoid a timeout disconnect. - */ - bool replyRequested; -} WalSndrMessage; - - -/* - * Header for a WAL data message (message type 'w'). This is wrapped within - * a CopyData message at the FE/BE protocol level. - * - * The header is followed by actual WAL data. Note that the data length is - * not specified in the header --- it's just whatever remains in the message. - * - * walEnd and sendTime are not essential data, but are provided in case - * the receiver wants to adjust its behavior depending on how far behind - * it is. - */ -typedef struct -{ - /* WAL start location of the data included in this message */ - XLogRecPtr dataStart; - - /* Current end of WAL on the sender */ - XLogRecPtr walEnd; - - /* Sender's system clock at the time of transmission */ - TimestampTz sendTime; -} WalDataMessageHeader; - -/* - * Keepalive message from primary (message type 'k'). (lowercase k) - * This is wrapped within a CopyData message at the FE/BE protocol level. - * - * Note that the data length is not specified here. - */ -typedef WalSndrMessage PrimaryKeepaliveMessage; - -/* - * Reply message from standby (message type 'r'). This is wrapped within - * a CopyData message at the FE/BE protocol level. - * - * Note that the data length is not specified here. - */ -typedef struct -{ - /* - * The xlog locations that have been written, flushed, and applied by - * standby-side. These may be invalid if the standby-side is unable to or - * chooses not to report these. - */ - XLogRecPtr write; - XLogRecPtr flush; - XLogRecPtr apply; - - /* Sender's system clock at the time of transmission */ - TimestampTz sendTime; - - /* - * If replyRequested is set, the server should reply immediately to this - * message, to avoid a timeout disconnect. - */ - bool replyRequested; -} StandbyReplyMessage; - -/* - * Hot Standby feedback from standby (message type 'h'). This is wrapped within - * a CopyData message at the FE/BE protocol level. - * - * Note that the data length is not specified here. - */ -typedef struct -{ - /* - * The current xmin and epoch from the standby, for Hot Standby feedback. - * This may be invalid if the standby-side does not support feedback, or - * Hot Standby is not yet available. - */ - TransactionId xmin; - uint32 epoch; - - /* Sender's system clock at the time of transmission */ - TimestampTz sendTime; -} StandbyHSFeedbackMessage; - -/* - * Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ. 
- *
- * We don't have a good idea of what a good value would be; there's some
- * overhead per message in both walsender and walreceiver, but on the other
- * hand sending large batches makes walsender less responsive to signals
- * because signals are checked only between messages. 128kB (with
- * default 8k blocks) seems like a reasonable guess for now.
- */
-#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)
-
-#endif /* _WALPROTOCOL_H */
diff --git a/src/include/rewrite/rewriteManip.h b/src/include/rewrite/rewriteManip.h
index e13331dcb5..1a96c556c9 100644
--- a/src/include/rewrite/rewriteManip.h
+++ b/src/include/rewrite/rewriteManip.h
@@ -31,6 +31,13 @@ struct replace_rte_variables_context
 	bool		inserted_sublink;	/* have we inserted a SubLink? */
 };
 
+typedef enum ReplaceVarsNoMatchOption
+{
+	REPLACEVARS_REPORT_ERROR,	/* throw error if no match */
+	REPLACEVARS_CHANGE_VARNO,	/* change the Var's varno, nothing else */
+	REPLACEVARS_SUBSTITUTE_NULL /* replace with a NULL Const */
+} ReplaceVarsNoMatchOption;
+
 extern void OffsetVarNodes(Node *node, int offset, int sublevels_up);
 
 extern void ChangeVarNodes(Node *node, int old_varno, int new_varno,
@@ -69,9 +76,12 @@ extern Node *map_variable_attnos(Node *node,
 					 const AttrNumber *attno_map, int map_length,
 					 bool *found_whole_row);
 
-extern Node *ResolveNew(Node *node, int target_varno, int sublevels_up,
-		   RangeTblEntry *target_rte,
-		   List *targetlist, int event, int update_varno,
-		   bool *outer_hasSubLinks);
+extern Node *ReplaceVarsFromTargetList(Node *node,
+						  int target_varno, int sublevels_up,
+						  RangeTblEntry *target_rte,
+						  List *targetlist,
+						  ReplaceVarsNoMatchOption nomatch_option,
+						  int nomatch_varno,
+						  bool *outer_hasSubLinks);
 
 #endif /* REWRITEMANIP_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index bad9f10c62..940d9d4751 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -16,13 +16,13 @@
 * calls:
 *
 *	File {Close, Read, Write, Seek, Tell, Sync}
- *	{File Name Open, Allocate, Free} File
+ *	{Path Name Open, Allocate, Free} File
 *
 * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
 * Use them for all file activity...
 *
 *	File fd;
- *	fd = FilePathOpenFile("foo", O_RDONLY, 0600);
+ *	fd = PathNameOpenFile("foo", O_RDONLY, 0600);
 *
 *	AllocateFile();
 *	FreeFile();
@@ -33,7 +33,8 @@
 * no way for them to share kernel file descriptors with other files.
 *
 * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
- * open directories (DIR*).
+ * open directories (DIR*), and OpenTransientFile/CloseTransientFile for an
+ * unbuffered file descriptor.
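+ *
+ * The transient-fd calls follow the usual open/close pattern; for example
+ * (a sketch, not code from this patch, with error reporting elided):
+ *
+ *		int		fd;
+ *
+ *		fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
+ *		if (fd < 0)
+ *			... report the error ...
+ *		... read from fd ...
+ *		CloseTransientFile(fd);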
*/ #ifndef FD_H #define FD_H @@ -66,7 +67,6 @@ extern int max_safe_fds; /* Operations on virtual Files --- equivalent to Unix kernel file ops */ extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode); extern File OpenTemporaryFile(bool interXact); -extern void FileSetTransient(File file); extern void FileClose(File file); extern int FilePrefetch(File file, off_t offset, int amount); extern int FileRead(File file, char *buffer, int amount); @@ -85,6 +85,10 @@ extern DIR *AllocateDir(const char *dirname); extern struct dirent *ReadDir(DIR *dir, const char *dirname); extern int FreeDir(DIR *dir); +/* Operations to allow use of a plain kernel FD, with automatic cleanup */ +extern int OpenTransientFile(FileName fileName, int fileFlags, int fileMode); +extern int CloseTransientFile(int fd); + /* If you've really really gotta have a plain kernel FD, use this */ extern int BasicOpenFile(FileName fileName, int fileFlags, int fileMode); diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h index 71fb4868a0..0fa572e50b 100644 --- a/src/include/storage/latch.h +++ b/src/include/storage/latch.h @@ -33,8 +33,8 @@ * ResetLatch - Clears the latch, allowing it to be set again * WaitLatch - Waits for the latch to become set * - * WaitLatch includes a provision for timeouts (which should hopefully not - * be necessary once the code is fully latch-ified) and a provision for + * WaitLatch includes a provision for timeouts (which should be avoided + * when possible, as they incur extra overhead) and a provision for * postmaster child processes to wake up immediately on postmaster death. * See unix_latch.c for detailed specifications for the exported functions. * @@ -64,14 +64,15 @@ * will be lifted in future by inserting suitable memory barriers into * SetLatch and ResetLatch. * + * On some platforms, signals will not interrupt the latch wait primitive + * by themselves. Therefore, it is critical that any signal handler that + * is meant to terminate a WaitLatch wait calls SetLatch. + * * Note that use of the process latch (PGPROC.procLatch) is generally better * than an ad-hoc shared latch for signaling auxiliary processes. This is * because generic signal handlers will call SetLatch on the process latch * only, so using any latch other than the process latch effectively precludes - * ever registering a generic handler. Since signals have the potential to - * invalidate the latch timeout on some platforms, resulting in a - * denial-of-service, it is important to verify that all signal handlers - * within all WaitLatch-calling processes call SetLatch. + * use of any generic handler. 
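+ *
+ * For example, a signal handler that is meant to wake up such a wait would
+ * be written roughly like this (a sketch, not code from this patch):
+ *
+ *		static void
+ *		sigusr1_handler(SIGNAL_ARGS)
+ *		{
+ *			... set a flag for the main loop to examine ...
+ *			SetLatch(&MyProc->procLatch);
+ *		}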
* * * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group @@ -111,6 +112,7 @@ typedef struct /* * prototypes for functions in latch.c */ +extern void InitializeLatchSupport(void); extern void InitLatch(volatile Latch *latch); extern void InitSharedLatch(volatile Latch *latch); extern void OwnLatch(volatile Latch *latch); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index d56f0fa4b7..e01a5c5044 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -544,6 +544,7 @@ extern void DumpAllLocks(void); /* Lock a VXID (used to wait for a transaction to finish) */ extern void VirtualXactLockTableInsert(VirtualTransactionId vxid); +extern void VirtualXactLockTableCleanup(void); extern bool VirtualXactLock(VirtualTransactionId vxid, bool wait); #endif /* LOCK_H */ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index e10aafe99e..686ac48657 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -168,7 +168,8 @@ typedef struct PGXACT uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; - bool inCommit; /* true if within commit critical section */ + bool delayChkpt; /* true if this proc delays checkpoint start */ + /* previously called InCommit */ uint8 nxids; } PGXACT; diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 5b4cab926f..9933dad635 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -52,8 +52,8 @@ extern bool TransactionIdIsActive(TransactionId xid); extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum); extern TransactionId GetOldestActiveTransactionId(void); -extern int GetTransactionsInCommit(TransactionId **xids_p); -extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids); +extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids); +extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids); extern PGPROC *BackendPidGetProc(int pid); extern int BackendXidGetPid(TransactionId xid); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 3560d53907..9eccf7671e 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -33,6 +33,9 @@ * without having to make the smgr explicitly aware of relcache. There * can't be more than one "owner" pointer per SMgrRelation, but that's * all we need. + * + * SMgrRelations that do not have an "owner" are considered to be transient, + * and are deleted at end of transaction. */ typedef struct SMgrRelationData { @@ -60,10 +63,12 @@ typedef struct SMgrRelationData * submodules. Do not touch them from elsewhere. 
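The smgr.h comment above declares that SMgrRelations without an owner are transient and go away at end of transaction; a little further down in this hunk they are threaded onto a list via next_unowned_reln and swept by the new AtEOXact_SMgr(). A toy version of that list-and-sweep pattern, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

typedef struct DemoRel
{
    const char *name;
    struct DemoRel *next_unowned;   /* list link, like next_unowned_reln */
} DemoRel;

static DemoRel *first_unowned = NULL;

static DemoRel *
demo_open(const char *name)
{
    DemoRel    *rel = malloc(sizeof(DemoRel));

    if (rel == NULL)
        exit(1);
    rel->name = name;
    /* no owner yet: push onto the unowned list */
    rel->next_unowned = first_unowned;
    first_unowned = rel;
    return rel;
}

/* analogue of AtEOXact_SMgr: close everything still unowned */
static void
demo_at_eoxact(void)
{
    while (first_unowned != NULL)
    {
        DemoRel    *rel = first_unowned;

        first_unowned = rel->next_unowned;
        printf("closing transient relation %s\n", rel->name);
        free(rel);
    }
}

int
main(void)
{
    demo_open("pg_toast_1234");
    demo_open("scratch_rel");
    demo_at_eoxact();           /* both get closed automatically */
    return 0;
}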
*/ int smgr_which; /* storage manager selector */ - bool smgr_transient; /* T if files are to be closed at EOXact */ /* for md.c; NULL for forks that are not open */ struct _MdfdVec *md_fd[MAX_FORKNUM + 1]; + + /* if unowned, list link in list of all unowned SMgrRelations */ + struct SMgrRelationData *next_unowned_reln; } SMgrRelationData; typedef SMgrRelationData *SMgrRelation; @@ -73,7 +78,6 @@ typedef SMgrRelationData *SMgrRelation; extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); -extern void smgrsettransient(SMgrRelation reln); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclose(SMgrRelation reln); @@ -97,6 +101,7 @@ extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void smgrpreckpt(void); extern void smgrsync(void); extern void smgrpostckpt(void); +extern void AtEOXact_SMgr(void); /* internals: move me elsewhere -- ay 7/94 */ diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 7024fc4f3c..34558a5a30 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -68,6 +68,7 @@ typedef struct xl_standby_locks typedef struct xl_running_xacts { int xcnt; /* # of xact ids in xids[] */ + int subxcnt; /* # of subxact ids in xids[] */ bool subxid_overflow; /* snapshot overflowed, subxids missing */ TransactionId nextXid; /* copy of ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ @@ -98,6 +99,7 @@ extern void standby_desc(StringInfo buf, uint8 xl_info, char *rec); typedef struct RunningTransactionsData { int xcnt; /* # of xact ids in xids[] */ + int subxcnt; /* # of subxact ids in xids[] */ bool subxid_overflow; /* snapshot overflowed, subxids missing */ TransactionId nextXid; /* copy of ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ @@ -111,6 +113,6 @@ typedef RunningTransactionsData *RunningTransactions; extern void LogAccessExclusiveLock(Oid dbOid, Oid relOid); extern void LogAccessExclusiveLockPrepare(void); -extern void LogStandbySnapshot(TransactionId *nextXid); +extern void LogStandbySnapshot(void); #endif /* STANDBY_H */ diff --git a/src/include/tcop/pquery.h b/src/include/tcop/pquery.h index 22aad2e96c..c77bb8be94 100644 --- a/src/include/tcop/pquery.h +++ b/src/include/tcop/pquery.h @@ -28,7 +28,7 @@ extern List *FetchPortalTargetList(Portal portal); extern List *FetchStatementTargetList(Node *stmt); extern void PortalStart(Portal portal, ParamListInfo params, - int eflags, bool use_active_snapshot); + int eflags, Snapshot snapshot); extern void PortalSetResultFormat(Portal portal, int nFormats, int16 *formats); diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index d91700a07e..3688684323 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -22,7 +22,7 @@ #include "access/htup.h" #include "access/skey.h" -#include "lib/dllist.h" +#include "lib/ilist.h" #include "utils/relcache.h" /* @@ -37,7 +37,7 @@ typedef struct catcache { int id; /* cache identifier --- see syscache.h */ - struct catcache *cc_next; /* link to next catcache */ + slist_node cc_next; /* list link */ const char *cc_relname; /* name of relation the tuples come from */ Oid cc_reloid; /* OID of relation the tuples come from */ Oid cc_indexoid; /* OID of index matching cache keys */ @@ -51,7 +51,7 @@ typedef struct catcache ScanKeyData cc_skey[CATCACHE_MAXKEYS]; /* precomputed key 
info for * heap scans */ bool cc_isname[CATCACHE_MAXKEYS]; /* flag "name" key columns */ - Dllist cc_lists; /* list of CatCList structs */ + dlist_head cc_lists; /* list of CatCList structs */ #ifdef CATCACHE_STATS long cc_searches; /* total # searches against this cache */ long cc_hits; /* # of matches against existing entry */ @@ -66,7 +66,7 @@ typedef struct catcache long cc_lsearches; /* total # list-searches */ long cc_lhits; /* # of matches against existing lists */ #endif - Dllist cc_bucket[1]; /* hash buckets --- VARIABLE LENGTH ARRAY */ + dlist_head cc_bucket[1]; /* hash buckets --- VARIABLE LENGTH ARRAY */ } CatCache; /* VARIABLE LENGTH STRUCT */ @@ -77,11 +77,11 @@ typedef struct catctup CatCache *my_cache; /* link to owning catcache */ /* - * Each tuple in a cache is a member of a Dllist that stores the elements - * of its hash bucket. We keep each Dllist in LRU order to speed repeated + * Each tuple in a cache is a member of a dlist that stores the elements + * of its hash bucket. We keep each dlist in LRU order to speed repeated * lookups. */ - Dlelem cache_elem; /* list member of per-bucket list */ + dlist_node cache_elem; /* list member of per-bucket list */ /* * The tuple may also be a member of at most one CatCList. (If a single @@ -139,7 +139,7 @@ typedef struct catclist * might not be true during bootstrap or recovery operations. (namespace.c * is able to save some cycles when it is true.) */ - Dlelem cache_elem; /* list member of per-catcache list */ + dlist_node cache_elem; /* list member of per-catcache list */ int refcount; /* number of active references */ bool dead; /* dead but not yet removed? */ bool ordered; /* members listed in index order? */ @@ -153,7 +153,7 @@ typedef struct catclist typedef struct catcacheheader { - CatCache *ch_caches; /* head of list of CatCache structs */ + slist_head ch_caches; /* head of list of CatCache structs */ int ch_ntup; /* # of tuples in all caches */ } CatCacheHeader; diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index 03298fbbaf..42c22cd90e 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -177,8 +177,19 @@ errhint(const char *fmt,...) the supplied arguments. */ __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); +/* + * errcontext() is typically called in error context callback functions, not + * within an ereport() invocation. The callback function can be in a different + * module than the ereport() call, so the message domain passed in errstart() + * is not usually the correct domain for translating the context message. + * set_errcontext_domain() first sets the domain to be used, and + * errcontext_msg() passes the actual message. + */ +#define errcontext set_errcontext_domain(TEXTDOMAIN), errcontext_msg + +extern int set_errcontext_domain(const char *domain); extern int -errcontext(const char *fmt,...) +errcontext_msg(const char *fmt,...) /* This extension allows gcc to check the format string for consistency with the supplied arguments. 
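The errcontext() redefinition above leans on the comma operator: the macro expands so that set_errcontext_domain(TEXTDOMAIN) is evaluated first and errcontext_msg() then receives the caller's arguments unchanged. A self-contained sketch of that macro shape, with hypothetical names:

#include <stdarg.h>
#include <stdio.h>

static const char *current_domain = "none";

static int
set_demo_domain(const char *domain)
{
    current_domain = domain;
    return 0;                   /* dummy value, discarded by the comma operator */
}

static int
demo_msg(const char *fmt, ...)
{
    va_list     ap;

    printf("[domain %s] ", current_domain);
    va_start(ap, fmt);
    vprintf(fmt, ap);
    va_end(ap);
    return 0;
}

/* expands to: set_demo_domain("mymodule"), demo_msg(<caller's arguments>) */
#define demo_context  set_demo_domain("mymodule"), demo_msg

int
main(void)
{
    demo_context("processing row %d\n", 42);
    return 0;
}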
*/ __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); @@ -320,6 +331,7 @@ typedef struct ErrorData int lineno; /* __LINE__ of ereport() call */ const char *funcname; /* __func__ of ereport() call */ const char *domain; /* message domain */ + const char *context_domain; /* message domain for context message */ int sqlerrcode; /* encoded ERRSTATE */ char *message; /* primary error message */ char *detail; /* detail error message */ diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index daafd0e15e..bfb03b8890 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -220,5 +220,6 @@ extern void PortalDefineQuery(Portal portal, extern Node *PortalListGetPrimaryStmt(List *stmts); extern void PortalCreateHoldStore(Portal portal); extern void PortalHashTableDeleteAll(void); +extern bool ThereAreNoReadyPortals(void); #endif /* PORTAL_H */ diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 6b2ef80d06..da47e79eda 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -48,5 +48,6 @@ extern Datum pg_export_snapshot(PG_FUNCTION_ARGS); extern void ImportSnapshot(const char *idstr); extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); +extern bool ThereAreNoPriorRegisteredSnapshots(void); #endif /* SNAPMGR_H */ diff --git a/src/include/utils/timestamp.h b/src/include/utils/timestamp.h index e7cdb417e5..b4b402f018 100644 --- a/src/include/utils/timestamp.h +++ b/src/include/utils/timestamp.h @@ -206,13 +206,24 @@ extern Datum generate_series_timestamptz(PG_FUNCTION_ARGS); /* Internal routines (not fmgr-callable) */ extern TimestampTz GetCurrentTimestamp(void); - extern void TimestampDifference(TimestampTz start_time, TimestampTz stop_time, long *secs, int *microsecs); extern bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec); +/* + * Prototypes for functions to deal with integer timestamps, when the native + * format is float timestamps. + */ +#ifndef HAVE_INT64_TIMESTAMP +extern int64 GetCurrentIntegerTimestamp(void); +extern TimestampTz IntegerTimestampToTimestampTz(int64 timestamp); +#else +#define GetCurrentIntegerTimestamp() GetCurrentTimestamp() +#define IntegerTimestampToTimestampTz(timestamp) (timestamp) +#endif + extern TimestampTz time_t_to_timestamptz(pg_time_t tm); extern pg_time_t timestamptz_to_time_t(TimestampTz t); diff --git a/src/interfaces/ecpg/Makefile b/src/interfaces/ecpg/Makefile index 0116ec0db9..e397210a71 100644 --- a/src/interfaces/ecpg/Makefile +++ b/src/interfaces/ecpg/Makefile @@ -10,7 +10,9 @@ SUBDIRS = include pgtypeslib ecpglib compatlib preproc # (There are some other parallelism bugs in the subdirectory makefiles # themselves, but there's little point in fixing them as long as we have # to use this big hammer.) 
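Several hunks above, timestamp.h gains GetCurrentIntegerTimestamp() and IntegerTimestampToTimestampTz() so that code which always exchanges integer (microsecond) timestamps can interoperate with a server built with float timestamps. The conversion itself is just a change of units; a sketch with hypothetical names, ignoring the epoch handling:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_USECS_PER_SEC INT64_C(1000000)

/* int64 microseconds -> double seconds (the float-timestamp representation) */
static double
demo_integer_to_float_ts(int64_t usecs)
{
    return (double) usecs / (double) DEMO_USECS_PER_SEC;
}

/* double seconds -> int64 microseconds, rounded to the nearest microsecond */
static int64_t
demo_float_to_integer_ts(double secs)
{
    return (int64_t) (secs * DEMO_USECS_PER_SEC + (secs >= 0 ? 0.5 : -0.5));
}

int
main(void)
{
    int64_t     t = INT64_C(404089445123456);   /* some microsecond count */

    printf("%.6f\n", demo_integer_to_float_ts(t));
    printf("%" PRId64 "\n", demo_float_to_integer_ts(demo_integer_to_float_ts(t)));
    return 0;
}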
+ifeq ($(MAKE_VERSION),3.82) .NOTPARALLEL: +endif $(recurse) diff --git a/src/interfaces/ecpg/ecpglib/Makefile b/src/interfaces/ecpg/ecpglib/Makefile index 565df3c735..daac615c1a 100644 --- a/src/interfaces/ecpg/ecpglib/Makefile +++ b/src/interfaces/ecpg/ecpglib/Makefile @@ -26,7 +26,7 @@ LIBS := $(filter-out -lpgport, $(LIBS)) OBJS= execute.o typename.o descriptor.o sqlda.o data.o error.o prepare.o memory.o \ connect.o misc.o path.o pgstrcasecmp.o \ - $(filter snprintf.o strlcpy.o win32setlocale.o, $(LIBOBJS)) + $(filter snprintf.o strlcpy.o win32setlocale.o isinf.o, $(LIBOBJS)) # thread.c is needed only for non-WIN32 implementation of path.c ifneq ($(PORTNAME), win32) @@ -57,7 +57,7 @@ include $(top_srcdir)/src/Makefile.shlib # necessarily use the same object files as the backend uses. Instead, # symlink the source files in here and build our own object file. -path.c pgstrcasecmp.c snprintf.c strlcpy.c thread.c win32setlocale.c: % : $(top_srcdir)/src/port/% +path.c pgstrcasecmp.c snprintf.c strlcpy.c thread.c win32setlocale.c isinf.c: % : $(top_srcdir)/src/port/% rm -f $@ && $(LN_S) $< . misc.o: misc.c $(top_builddir)/src/port/pg_config_paths.h diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile index 6e117d49c7..5834299067 100644 --- a/src/interfaces/ecpg/preproc/Makefile +++ b/src/interfaces/ecpg/preproc/Makefile @@ -30,6 +30,12 @@ OBJS= preproc.o type.o ecpg.o output.o parser.o \ keywords.o c_keywords.o ecpg_keywords.o kwlookup.o ../ecpglib/typename.o descriptor.o variable.o \ $(WIN32RES) +# Suppress parallel build to avoid a bug in gmake 3.82 +# (see comments in ../Makefile) +ifeq ($(MAKE_VERSION),3.82) +.NOTPARALLEL: +endif + all: ecpg ecpg: $(OBJS) | submake-libpgport diff --git a/src/interfaces/ecpg/preproc/type.c b/src/interfaces/ecpg/preproc/type.c index c743616a6c..cf2ff15841 100644 --- a/src/interfaces/ecpg/preproc/type.c +++ b/src/interfaces/ecpg/preproc/type.c @@ -506,8 +506,8 @@ ECPGdump_a_struct(FILE *o, const char *name, const char *ind_name, char *arrsiz, */ struct ECPGstruct_member *p, *ind_p = NULL; - char pbuf[BUFSIZ], - ind_pbuf[BUFSIZ]; + char *pbuf = (char *) mm_alloc(strlen(name) + ((prefix == NULL) ? 0 : strlen(prefix)) + 3); + char *ind_pbuf = (char *) mm_alloc(strlen(ind_name) + ((ind_prefix == NULL) ? 0 : strlen(ind_prefix)) + 3); if (atoi(arrsiz) == 1) sprintf(pbuf, "%s%s.", prefix ? 
prefix : "", name); @@ -540,6 +540,9 @@ ECPGdump_a_struct(FILE *o, const char *name, const char *ind_name, char *arrsiz, if (ind_p != NULL && ind_p != &struct_no_indicator) ind_p = ind_p->next; } + + free(pbuf); + free(ind_pbuf); } void diff --git a/src/interfaces/ecpg/preproc/variable.c b/src/interfaces/ecpg/preproc/variable.c index e08e14ac18..6ff574bbd9 100644 --- a/src/interfaces/ecpg/preproc/variable.c +++ b/src/interfaces/ecpg/preproc/variable.c @@ -100,7 +100,11 @@ find_struct_member(char *name, char *str, struct ECPGstruct_member * members, in } break; case '-': - return (find_struct_member(name, end, members->type->u.element->u.members, brace_level)); + if (members->type->type == ECPGt_array) + return (find_struct_member(name, ++end, members->type->u.element->u.members, brace_level)); + else + return (find_struct_member(name, ++end, members->type->u.members, brace_level)); + break; break; case '.': if (members->type->type == ECPGt_array) diff --git a/src/interfaces/libpq/exports.txt b/src/interfaces/libpq/exports.txt index 56d0bb8dc5..93da50df31 100644 --- a/src/interfaces/libpq/exports.txt +++ b/src/interfaces/libpq/exports.txt @@ -164,3 +164,4 @@ PQsetSingleRowMode 161 lo_lseek64 162 lo_tell64 163 lo_truncate64 164 +PQconninfo 165 diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 9eaf41025b..beee8172a0 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -137,81 +137,112 @@ static int ldapServiceLookup(const char *purl, PQconninfoOption *options, * PQconninfoOptions[] *must* be NULL. In a working copy, non-null "val" * fields point to malloc'd strings that should be freed when the working * array is freed (see PQconninfoFree). + * + * The first part of each struct is identical to the one in libpq-fe.h, + * which is required since we memcpy() data between the two! * ---------- */ -static const PQconninfoOption PQconninfoOptions[] = { +typedef struct _internalPQconninfoOption +{ + char *keyword; /* The keyword of the option */ + char *envvar; /* Fallback environment variable name */ + char *compiled; /* Fallback compiled in default value */ + char *val; /* Option's current value, or NULL */ + char *label; /* Label for field in connect dialog */ + char *dispchar; /* Indicates how to display this field in a + * connect dialog. Values are: "" Display + * entered value as is "*" Password field - + * hide value "D" Debug option - don't show + * by default */ + int dispsize; /* Field size in characters for dialog */ + /* --- + * Anything above this comment must be synchronized with + * PQconninfoOption in libpq-fe.h, since we memcpy() data + * between them! + * --- + */ + off_t connofs; /* Offset into PGconn struct, -1 if not there */ +} internalPQconninfoOption; + +static const internalPQconninfoOption PQconninfoOptions[] = { /* * "authtype" is no longer used, so mark it "don't show". We keep it in * the array so as not to reject conninfo strings from old apps that might * still try to set it. 
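The internalPQconninfoOption struct defined just above carries connofs, an offsetof() into struct pg_conn, which lets fillPGconn() later in this file replace its long chain of per-keyword strdup() calls with one generic loop. A self-contained sketch of the offsetof technique against a hypothetical struct (not the libpq code):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct DemoConn
{
    char       *host;
    char       *dbname;
    char       *user;
} DemoConn;

typedef struct DemoOption
{
    const char *keyword;
    const char *value;          /* stands in for conninfo_getval() */
    ptrdiff_t   connofs;        /* offset into DemoConn, -1 if not stored */
} DemoOption;

static const DemoOption demo_options[] = {
    {"host", "db.example.com", offsetof(DemoConn, host)},
    {"dbname", "postgres", offsetof(DemoConn, dbname)},
    {"user", "alice", offsetof(DemoConn, user)},
    {"authtype", "md5", -1},    /* recognized but not stored anywhere */
    {NULL, NULL, 0}
};

int
main(void)
{
    DemoConn    conn;
    const DemoOption *opt;

    memset(&conn, 0, sizeof(conn));

    for (opt = demo_options; opt->keyword; opt++)
    {
        if (opt->value != NULL && opt->connofs >= 0)
        {
            char      **member = (char **) ((char *) &conn + opt->connofs);

            free(*member);      /* discard any previous value */
            *member = strdup(opt->value);
        }
    }

    printf("host=%s dbname=%s user=%s\n", conn.host, conn.dbname, conn.user);
    return 0;
}

The loop condition mirrors the fillPGconn() rewrite above: options whose offset is negative are accepted but never copied into the connection struct.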
*/ {"authtype", "PGAUTHTYPE", DefaultAuthtype, NULL, - "Database-Authtype", "D", 20}, + "Database-Authtype", "D", 20, -1}, {"service", "PGSERVICE", NULL, NULL, - "Database-Service", "", 20}, + "Database-Service", "", 20, -1}, {"user", "PGUSER", NULL, NULL, - "Database-User", "", 20}, + "Database-User", "", 20, + offsetof(struct pg_conn, pguser)}, {"password", "PGPASSWORD", NULL, NULL, - "Database-Password", "*", 20}, + "Database-Password", "*", 20, + offsetof(struct pg_conn, pgpass)}, {"connect_timeout", "PGCONNECT_TIMEOUT", NULL, NULL, - "Connect-timeout", "", 10}, /* strlen(INT32_MAX) == 10 */ + "Connect-timeout", "", 10, /* strlen(INT32_MAX) == 10 */ + offsetof(struct pg_conn, connect_timeout)}, {"dbname", "PGDATABASE", NULL, NULL, - "Database-Name", "", 20}, + "Database-Name", "", 20, + offsetof(struct pg_conn, dbName)}, {"host", "PGHOST", NULL, NULL, - "Database-Host", "", 40}, + "Database-Host", "", 40, + offsetof(struct pg_conn, pghost)}, {"hostaddr", "PGHOSTADDR", NULL, NULL, - "Database-Host-IP-Address", "", 45}, + "Database-Host-IP-Address", "", 45, + offsetof(struct pg_conn, pghostaddr)}, {"port", "PGPORT", DEF_PGPORT_STR, NULL, - "Database-Port", "", 6}, + "Database-Port", "", 6, + offsetof(struct pg_conn, pgport)}, {"client_encoding", "PGCLIENTENCODING", NULL, NULL, - "Client-Encoding", "", 10}, + "Client-Encoding", "", 10, + offsetof(struct pg_conn, client_encoding_initial)}, /* * "tty" is no longer used either, but keep it present for backwards * compatibility. */ {"tty", "PGTTY", DefaultTty, NULL, - "Backend-Debug-TTY", "D", 40}, + "Backend-Debug-TTY", "D", 40, + offsetof(struct pg_conn, pgtty)}, {"options", "PGOPTIONS", DefaultOption, NULL, - "Backend-Debug-Options", "D", 40}, + "Backend-Debug-Options", "D", 40, + offsetof(struct pg_conn, pgoptions)}, {"application_name", "PGAPPNAME", NULL, NULL, - "Application-Name", "", 64}, + "Application-Name", "", 64, + offsetof(struct pg_conn, appname)}, {"fallback_application_name", NULL, NULL, NULL, - "Fallback-Application-Name", "", 64}, + "Fallback-Application-Name", "", 64, + offsetof(struct pg_conn, fbappname)}, {"keepalives", NULL, NULL, NULL, - "TCP-Keepalives", "", 1}, /* should be just '0' or '1' */ + "TCP-Keepalives", "", 1, /* should be just '0' or '1' */ + offsetof(struct pg_conn, keepalives)}, {"keepalives_idle", NULL, NULL, NULL, - "TCP-Keepalives-Idle", "", 10}, /* strlen(INT32_MAX) == 10 */ + "TCP-Keepalives-Idle", "", 10, /* strlen(INT32_MAX) == 10 */ + offsetof(struct pg_conn, keepalives_idle)}, {"keepalives_interval", NULL, NULL, NULL, - "TCP-Keepalives-Interval", "", 10}, /* strlen(INT32_MAX) == 10 */ + "TCP-Keepalives-Interval", "", 10, /* strlen(INT32_MAX) == 10 */ + offsetof(struct pg_conn, keepalives_interval)}, {"keepalives_count", NULL, NULL, NULL, - "TCP-Keepalives-Count", "", 10}, /* strlen(INT32_MAX) == 10 */ - -#ifdef USE_SSL - - /* - * "requiressl" is deprecated, its purpose having been taken over by - * "sslmode". It remains for backwards compatibility. - */ - {"requiressl", "PGREQUIRESSL", "0", NULL, - "Require-SSL", "D", 1}, -#endif + "TCP-Keepalives-Count", "", 10, /* strlen(INT32_MAX) == 10 */ + offsetof(struct pg_conn, keepalives_count)}, /* * ssl options are allowed even without client SSL support because the @@ -220,30 +251,38 @@ static const PQconninfoOption PQconninfoOptions[] = { * to exclude them since none of them are mandatory. 
*/ {"sslmode", "PGSSLMODE", DefaultSSLMode, NULL, - "SSL-Mode", "", 8}, /* sizeof("disable") == 8 */ + "SSL-Mode", "", 8, /* sizeof("disable") == 8 */ + offsetof(struct pg_conn, sslmode)}, {"sslcompression", "PGSSLCOMPRESSION", "1", NULL, - "SSL-Compression", "", 1}, + "SSL-Compression", "", 1, + offsetof(struct pg_conn, sslcompression)}, {"sslcert", "PGSSLCERT", NULL, NULL, - "SSL-Client-Cert", "", 64}, + "SSL-Client-Cert", "", 64, + offsetof(struct pg_conn, sslcert)}, {"sslkey", "PGSSLKEY", NULL, NULL, - "SSL-Client-Key", "", 64}, + "SSL-Client-Key", "", 64, + offsetof(struct pg_conn, sslkey)}, {"sslrootcert", "PGSSLROOTCERT", NULL, NULL, - "SSL-Root-Certificate", "", 64}, + "SSL-Root-Certificate", "", 64, + offsetof(struct pg_conn, sslrootcert)}, {"sslcrl", "PGSSLCRL", NULL, NULL, - "SSL-Revocation-List", "", 64}, + "SSL-Revocation-List", "", 64, + offsetof(struct pg_conn, sslcrl)}, {"requirepeer", "PGREQUIREPEER", NULL, NULL, - "Require-Peer", "", 10}, + "Require-Peer", "", 10, + offsetof(struct pg_conn, requirepeer)}, #if defined(KRB5) || defined(ENABLE_GSS) || defined(ENABLE_SSPI) /* Kerberos and GSSAPI authentication support specifying the service name */ {"krbsrvname", "PGKRBSRVNAME", PG_KRB_SRVNAM, NULL, - "Kerberos-service-name", "", 20}, + "Kerberos-service-name", "", 20, + offsetof(struct pg_conn, krbsrvname)}, #endif #if defined(ENABLE_GSS) && defined(ENABLE_SSPI) @@ -253,11 +292,13 @@ static const PQconninfoOption PQconninfoOptions[] = { * default */ {"gsslib", "PGGSSLIB", NULL, NULL, - "GSS-library", "", 7}, /* sizeof("gssapi") = 7 */ + "GSS-library", "", 7, /* sizeof("gssapi") = 7 */ + offsetof(struct pg_conn, gsslib)}, #endif {"replication", NULL, NULL, NULL, - "Replication", "D", 5}, + "Replication", "D", 5, + offsetof(struct pg_conn, replication)}, /* Terminating entry --- MUST BE LAST */ {NULL, NULL, NULL, NULL, @@ -627,7 +668,7 @@ PQconnectStart(const char *conninfo) static void fillPGconn(PGconn *conn, PQconninfoOption *connOptions) { - const char *tmp; + const internalPQconninfoOption *option; /* * Move option values into conn structure @@ -637,72 +678,19 @@ fillPGconn(PGconn *conn, PQconninfoOption *connOptions) * * XXX: probably worth checking strdup() return value here... */ - tmp = conninfo_getval(connOptions, "hostaddr"); - conn->pghostaddr = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "host"); - conn->pghost = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "port"); - conn->pgport = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "tty"); - conn->pgtty = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "options"); - conn->pgoptions = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "application_name"); - conn->appname = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "fallback_application_name"); - conn->fbappname = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "dbname"); - conn->dbName = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "user"); - conn->pguser = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "password"); - conn->pgpass = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "connect_timeout"); - conn->connect_timeout = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "client_encoding"); - conn->client_encoding_initial = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "keepalives"); - conn->keepalives = tmp ? 
strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "keepalives_idle"); - conn->keepalives_idle = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "keepalives_interval"); - conn->keepalives_interval = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "keepalives_count"); - conn->keepalives_count = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "sslmode"); - conn->sslmode = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "sslcompression"); - conn->sslcompression = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "sslkey"); - conn->sslkey = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "sslcert"); - conn->sslcert = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "sslrootcert"); - conn->sslrootcert = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "sslcrl"); - conn->sslcrl = tmp ? strdup(tmp) : NULL; -#ifdef USE_SSL - tmp = conninfo_getval(connOptions, "requiressl"); - if (tmp && tmp[0] == '1') + for (option = PQconninfoOptions; option->keyword; option++) { - /* here warn that the requiressl option is deprecated? */ - if (conn->sslmode) - free(conn->sslmode); - conn->sslmode = strdup("require"); + const char *tmp = conninfo_getval(connOptions, option->keyword); + + if (tmp && option->connofs >= 0) + { + char **connmember = (char **) ((char *) conn + option->connofs); + + if (*connmember) + free(*connmember); + *connmember = tmp ? strdup(tmp) : NULL; + } } -#endif - tmp = conninfo_getval(connOptions, "requirepeer"); - conn->requirepeer = tmp ? strdup(tmp) : NULL; -#if defined(KRB5) || defined(ENABLE_GSS) || defined(ENABLE_SSPI) - tmp = conninfo_getval(connOptions, "krbsrvname"); - conn->krbsrvname = tmp ? strdup(tmp) : NULL; -#endif -#if defined(ENABLE_GSS) && defined(ENABLE_SSPI) - tmp = conninfo_getval(connOptions, "gsslib"); - conn->gsslib = tmp ? strdup(tmp) : NULL; -#endif - tmp = conninfo_getval(connOptions, "replication"); - conn->replication = tmp ? strdup(tmp) : NULL; } /* @@ -1322,7 +1310,7 @@ static int connectDBStart(PGconn *conn) { int portnum; - char portstr[128]; + char portstr[MAXPGPATH]; struct addrinfo *addrs = NULL; struct addrinfo hint; const char *node; @@ -1384,6 +1372,15 @@ connectDBStart(PGconn *conn) node = NULL; hint.ai_family = AF_UNIX; UNIXSOCK_PATH(portstr, portnum, conn->pgunixsocket); + if (strlen(portstr) >= UNIXSOCK_PATH_BUFLEN) + { + appendPQExpBuffer(&conn->errorMessage, + libpq_gettext("Unix-domain socket path \"%s\" is too long (maximum %d bytes)\n"), + portstr, + (int) (UNIXSOCK_PATH_BUFLEN - 1)); + conn->options_valid = false; + goto connect_errReturn; + } #else /* Without Unix sockets, default to localhost instead */ node = DefaultHost; @@ -4011,15 +4008,29 @@ static PQconninfoOption * conninfo_init(PQExpBuffer errorMessage) { PQconninfoOption *options; + PQconninfoOption *opt_dest; + const internalPQconninfoOption *cur_opt; - options = (PQconninfoOption *) malloc(sizeof(PQconninfoOptions)); + /* + * Get enough memory for all options in PQconninfoOptions, even if some + * end up being filtered out. 
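Earlier in this hunk, connectDBStart() starts rejecting socket directories whose formatted Unix-domain socket path would not fit in UNIXSOCK_PATH_BUFLEN (the size of sockaddr_un.sun_path), reporting the problem up front. A minimal standalone version of that length check, with hypothetical names:

#include <stdio.h>
#include <string.h>
#include <sys/un.h>

static int
demo_socket_path_ok(const char *dir, int port)
{
    char        path[1024];
    size_t      maxlen = sizeof(((struct sockaddr_un *) 0)->sun_path);

    /* PostgreSQL-style socket file name inside the chosen directory */
    snprintf(path, sizeof(path), "%s/.s.PGSQL.%d", dir, port);
    if (strlen(path) >= maxlen)
    {
        fprintf(stderr,
                "socket path \"%s\" is too long (maximum %d bytes)\n",
                path, (int) (maxlen - 1));
        return 0;
    }
    return 1;
}

int
main(void)
{
    return demo_socket_path_ok("/tmp", 5432) ? 0 : 1;
}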
+ */ + options = (PQconninfoOption *) malloc(sizeof(PQconninfoOption) * sizeof(PQconninfoOptions) / sizeof(PQconninfoOptions[0])); if (options == NULL) { printfPQExpBuffer(errorMessage, libpq_gettext("out of memory\n")); return NULL; } - memcpy(options, PQconninfoOptions, sizeof(PQconninfoOptions)); + opt_dest = options; + + for (cur_opt = PQconninfoOptions; cur_opt->keyword; cur_opt++) + { + /* Only copy the public part of the struct, not the full internal */ + memcpy(opt_dest, cur_opt, sizeof(PQconninfoOption)); + opt_dest++; + } + MemSet(opt_dest, 0, sizeof(PQconninfoOption)); return options; } @@ -5008,6 +5019,20 @@ conninfo_storeval(PQconninfoOption *connOptions, PQconninfoOption *option; char *value_copy; + /* + * For backwards compatibility, requiressl=1 gets translated to + * sslmode=require, and requiressl=0 gets translated to sslmode=prefer + * (which is the default for sslmode). + */ + if (strcmp(keyword, "requiressl") == 0) + { + keyword = "sslmode"; + if (value[0] == '1') + value = "require"; + else + value = "prefer"; + } + option = conninfo_find(connOptions, keyword); if (option == NULL) { @@ -5066,6 +5091,50 @@ conninfo_find(PQconninfoOption *connOptions, const char *keyword) } +/* + * Return the connection options used for the connection + */ +PQconninfoOption * +PQconninfo(PGconn *conn) +{ + PQExpBufferData errorBuf; + PQconninfoOption *connOptions; + + if (conn == NULL) + return NULL; + + /* We don't actually report any errors here, but callees want a buffer */ + initPQExpBuffer(&errorBuf); + if (PQExpBufferDataBroken(errorBuf)) + return NULL; /* out of memory already :-( */ + + connOptions = conninfo_init(&errorBuf); + + if (connOptions != NULL) + { + const internalPQconninfoOption *option; + + for (option = PQconninfoOptions; option->keyword; option++) + { + char **connmember; + + if (option->connofs < 0) + continue; + + connmember = (char **) ((char *) conn + option->connofs); + + if (*connmember) + conninfo_storeval(connOptions, option->keyword, *connmember, + &errorBuf, true, false); + } + } + + termPQExpBuffer(&errorBuf); + + return connOptions; +} + + void PQconninfoFree(PQconninfoOption *connOptions) { diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h index 0b8d9a6813..a289ef3090 100644 --- a/src/interfaces/libpq/libpq-fe.h +++ b/src/interfaces/libpq/libpq-fe.h @@ -262,6 +262,9 @@ extern PQconninfoOption *PQconndefaults(void); /* parse connection options in same way as PQconnectdb */ extern PQconninfoOption *PQconninfoParse(const char *conninfo, char **errmsg); +/* return the connection options used by a live connection */ +extern PQconninfoOption *PQconninfo(PGconn *conn); + /* free the data structure returned by PQconndefaults() or PQconninfoParse() */ extern void PQconninfoFree(PQconninfoOption *connOptions); diff --git a/src/makefiles/pgxs.mk b/src/makefiles/pgxs.mk index 318d5ef307..fd6473fd91 100644 --- a/src/makefiles/pgxs.mk +++ b/src/makefiles/pgxs.mk @@ -146,6 +146,9 @@ endif # MODULE_big installdirs: +ifneq (,$(EXTENSION)) + $(MKDIR_P) '$(DESTDIR)$(datadir)/extension' +endif ifneq (,$(DATA)$(DATA_built)) $(MKDIR_P) '$(DESTDIR)$(datadir)/$(datamoduledir)' endif diff --git a/src/test/isolation/Makefile b/src/test/isolation/Makefile index 6579be148c..5e5c9bb74e 100644 --- a/src/test/isolation/Makefile +++ b/src/test/isolation/Makefile @@ -13,19 +13,19 @@ override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) -I$(srcdir)/../regress $(CPPF OBJS = specparse.o isolationtester.o +all: isolationtester$(X) pg_isolation_regress$(X) + 
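The PQconninfo() function added above (and exported as libpq symbol 165) returns the option array describing an established connection, in the same form as PQconndefaults() and PQconninfoParse(). A small client-side usage sketch; the connection string is hypothetical, and the program is built against libpq (cc demo.c -lpq):

#include <stdio.h>
#include <string.h>
#include <libpq-fe.h>

int
main(void)
{
    PGconn     *conn = PQconnectdb("dbname=postgres");
    PQconninfoOption *opts;
    PQconninfoOption *opt;

    if (PQstatus(conn) != CONNECTION_OK)
    {
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
        PQfinish(conn);
        return 1;
    }

    opts = PQconninfo(conn);
    for (opt = opts; opt && opt->keyword; opt++)
    {
        /* dispchar "*" marks password-style fields; skip them here */
        if (opt->val != NULL && strcmp(opt->dispchar, "*") != 0)
            printf("%s = %s\n", opt->keyword, opt->val);
    }
    PQconninfoFree(opts);
    PQfinish(conn);
    return 0;
}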
submake-regress: $(MAKE) -C $(top_builddir)/src/test/regress pg_regress.o pg_regress.o: | submake-regress rm -f $@ && $(LN_S) $(top_builddir)/src/test/regress/pg_regress.o . -pg_isolation_regress: isolation_main.o pg_regress.o - $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) - -all: isolationtester pg_isolation_regress +pg_isolation_regress$(X): isolation_main.o pg_regress.o + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@ -isolationtester: $(OBJS) | submake-libpq submake-libpgport - $(CC) $(CFLAGS) $^ $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) +isolationtester$(X): $(OBJS) | submake-libpq submake-libpgport + $(CC) $(CFLAGS) $^ $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@ distprep: specparse.c specscanner.c @@ -36,8 +36,6 @@ distprep: specparse.c specscanner.c # important, otherwise make will choose the built-in rule for # gram.y=>gram.c. -all: isolationtester$(X) pg_isolation_regress$(X) - specparse.h: specparse.c ; # specscanner is compiled as part of specparse diff --git a/src/test/isolation/expected/drop-index-concurrently-1.out b/src/test/isolation/expected/drop-index-concurrently-1.out new file mode 100644 index 0000000000..8bedf13d33 --- /dev/null +++ b/src/test/isolation/expected/drop-index-concurrently-1.out @@ -0,0 +1,40 @@ +Parsed test spec with 3 sessions + +starting permutation: noseq prepi preps begin explaini explains select2 drop insert2 end2 selecti selects end +step noseq: SET enable_seqscan = false; +step prepi: PREPARE getrow_idx AS SELECT * FROM test_dc WHERE data=34 ORDER BY id,data; +step preps: PREPARE getrow_seq AS SELECT * FROM test_dc WHERE data::text=34::text ORDER BY id,data; +step begin: BEGIN; +step explaini: EXPLAIN (COSTS OFF) EXECUTE getrow_idx; +QUERY PLAN + +Sort + Sort Key: id + -> Index Scan using test_dc_data on test_dc + Index Cond: (data = 34) +step explains: EXPLAIN (COSTS OFF) EXECUTE getrow_seq; +QUERY PLAN + +Sort + Sort Key: id, data + -> Seq Scan on test_dc + Filter: ((data)::text = '34'::text) +step select2: SELECT * FROM test_dc WHERE data=34 ORDER BY id,data; +id data + +34 34 +step drop: DROP INDEX CONCURRENTLY test_dc_data; +step insert2: INSERT INTO test_dc(data) SELECT * FROM generate_series(1, 100); +step end2: COMMIT; +step selecti: EXECUTE getrow_idx; +id data + +34 34 +134 34 +step selects: EXECUTE getrow_seq; +id data + +34 34 +134 34 +step end: COMMIT; +step drop: <... completed> diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 75e33bc99f..1d0770cd37 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -14,3 +14,4 @@ test: fk-contention test: fk-deadlock test: fk-deadlock2 test: eval-plan-qual +test: drop-index-concurrently-1 diff --git a/src/test/isolation/specs/drop-index-concurrently-1.spec b/src/test/isolation/specs/drop-index-concurrently-1.spec new file mode 100644 index 0000000000..8ebe5e6f0d --- /dev/null +++ b/src/test/isolation/specs/drop-index-concurrently-1.spec @@ -0,0 +1,38 @@ +# DROP INDEX CONCURRENTLY +# +# This test shows that the concurrent write behaviour works correctly +# with the expected output being 2 rows. 
+# +setup +{ + CREATE TABLE test_dc(id serial primary key, data int); + INSERT INTO test_dc(data) SELECT * FROM generate_series(1, 100); + CREATE INDEX test_dc_data ON test_dc(data); +} + +teardown +{ + DROP TABLE test_dc; +} + +session "s1" +step "noseq" { SET enable_seqscan = false; } +step "prepi" { PREPARE getrow_idx AS SELECT * FROM test_dc WHERE data=34 ORDER BY id,data; } +step "preps" { PREPARE getrow_seq AS SELECT * FROM test_dc WHERE data::text=34::text ORDER BY id,data; } +step "begin" { BEGIN; } +step "explaini" { EXPLAIN (COSTS OFF) EXECUTE getrow_idx; } +step "explains" { EXPLAIN (COSTS OFF) EXECUTE getrow_seq; } +step "selecti" { EXECUTE getrow_idx; } +step "selects" { EXECUTE getrow_seq; } +step "end" { COMMIT; } + +session "s2" +setup { BEGIN; } +step "select2" { SELECT * FROM test_dc WHERE data=34 ORDER BY id,data; } +step "insert2" { INSERT INTO test_dc(data) SELECT * FROM generate_series(1, 100); } +step "end2" { COMMIT; } + +session "s3" +step "drop" { DROP INDEX CONCURRENTLY test_dc_data; } + +permutation "noseq" "prepi" "preps" "begin" "explaini" "explains" "select2" "drop" "insert2" "end2" "selecti" "selects" "end" diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 7286f1aa44..4c5b98a612 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -740,6 +740,45 @@ select min(f1), max(f1) from minmaxtest; 11 | 18 (1 row) +-- DISTINCT doesn't do anything useful here, but it shouldn't fail +explain (costs off) + select distinct min(f1), max(f1) from minmaxtest; + QUERY PLAN +---------------------------------------------------------------------------------------------- + HashAggregate + InitPlan 1 (returns $0) + -> Limit + -> Merge Append + Sort Key: minmaxtest.f1 + -> Index Only Scan using minmaxtesti on minmaxtest + Index Cond: (f1 IS NOT NULL) + -> Index Only Scan using minmaxtest1i on minmaxtest1 + Index Cond: (f1 IS NOT NULL) + -> Index Only Scan Backward using minmaxtest2i on minmaxtest2 + Index Cond: (f1 IS NOT NULL) + -> Index Only Scan using minmaxtest3i on minmaxtest3 + Index Cond: (f1 IS NOT NULL) + InitPlan 2 (returns $1) + -> Limit + -> Merge Append + Sort Key: minmaxtest_1.f1 + -> Index Only Scan Backward using minmaxtesti on minmaxtest minmaxtest_1 + Index Cond: (f1 IS NOT NULL) + -> Index Only Scan Backward using minmaxtest1i on minmaxtest1 minmaxtest1_1 + Index Cond: (f1 IS NOT NULL) + -> Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest2_1 + Index Cond: (f1 IS NOT NULL) + -> Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest3_1 + Index Cond: (f1 IS NOT NULL) + -> Result +(26 rows) + +select distinct min(f1), max(f1) from minmaxtest; + min | max +-----+----- + 11 | 18 +(1 row) + drop table minmaxtest cascade; NOTICE: drop cascades to 3 other objects DETAIL: drop cascades to table minmaxtest1 diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index c22d74c7b5..8392c27b36 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -1780,6 +1780,28 @@ where oid = 'test_storage'::regclass; t (1 row) +-- ALTER TYPE with a check constraint and a child table (bug before Nov 2012) +CREATE TABLE test_inh_check (a float check (a > 10.2)); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check +Table "public.test_inh_check" + Column | Type | Modifiers 
+--------+---------+----------- + a | numeric | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child +Table "public.test_inh_check_child" + Column | Type | Modifiers +--------+---------+----------- + a | numeric | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + -- -- lock levels -- diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index b1d07b3e1f..7a9e546aad 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -254,6 +254,130 @@ SELECT * FROM testnull; | (4 rows) +CREATE TABLE vistest (LIKE testeoc); +BEGIN; +TRUNCATE vistest; +COPY vistest FROM stdin CSV; +SELECT * FROM vistest; + a +---- + a1 + b +(2 rows) + +SAVEPOINT s1; +TRUNCATE vistest; +COPY vistest FROM stdin CSV; +SELECT * FROM vistest; + a +---- + d1 + e +(2 rows) + +COMMIT; +SELECT * FROM vistest; + a +---- + d1 + e +(2 rows) + +BEGIN; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +SELECT * FROM vistest; + a +---- + a2 + b +(2 rows) + +SAVEPOINT s1; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +SELECT * FROM vistest; + a +---- + d2 + e +(2 rows) + +COMMIT; +SELECT * FROM vistest; + a +---- + d2 + e +(2 rows) + +BEGIN; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +SELECT * FROM vistest; + a +--- + x + y +(2 rows) + +COMMIT; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +NOTICE: FREEZE option specified but pre-conditions not met +BEGIN; +INSERT INTO vistest VALUES ('z'); +SAVEPOINT s1; +TRUNCATE vistest; +ROLLBACK TO SAVEPOINT s1; +COPY vistest FROM stdin CSV FREEZE; +NOTICE: FREEZE option specified but pre-conditions not met +SELECT * FROM vistest; + a +---- + p + g + z + d3 + e +(5 rows) + +COMMIT; +CREATE FUNCTION truncate_in_subxact() RETURNS VOID AS +$$ +BEGIN + TRUNCATE vistest; +EXCEPTION + WHEN OTHERS THEN + INSERT INTO vistest VALUES ('subxact failure'); +END; +$$ language plpgsql; +BEGIN; +INSERT INTO vistest VALUES ('z'); +SELECT truncate_in_subxact(); + truncate_in_subxact +--------------------- + +(1 row) + +COPY vistest FROM stdin CSV FREEZE; +SELECT * FROM vistest; + a +---- + d4 + e +(2 rows) + +COMMIT; +SELECT * FROM vistest; + a +---- + d4 + e +(2 rows) + +DROP TABLE vistest; +DROP FUNCTION truncate_in_subxact(); DROP TABLE x, y; DROP FUNCTION fn_x_before(); DROP FUNCTION fn_x_after(); diff --git a/src/test/regress/expected/enum.out b/src/test/regress/expected/enum.out index ed729dddc3..36826428a0 100644 --- a/src/test/regress/expected/enum.out +++ b/src/test/regress/expected/enum.out @@ -556,6 +556,30 @@ ERROR: foreign key constraint "enumtest_bogus_child_parent_fkey" cannot be impl DETAIL: Key columns "parent" and "id" are of incompatible types: bogus and rainbow. DROP TYPE bogus; -- +-- check transactional behaviour of ALTER TYPE ... ADD VALUE +-- +CREATE TYPE bogus AS ENUM('good'); +-- check that we can't add new values to existing enums in a transaction +BEGIN; +ALTER TYPE bogus ADD VALUE 'bad'; +ERROR: ALTER TYPE ... ADD cannot run inside a transaction block +COMMIT; +-- check that we recognize the case where the enum already existed but was +-- modified in the current txn +BEGIN; +ALTER TYPE bogus RENAME TO bogon; +ALTER TYPE bogon ADD VALUE 'bad'; +ERROR: ALTER TYPE ... 
ADD cannot run inside a transaction block +ROLLBACK; +DROP TYPE bogus; +-- check that we *can* add new values to existing enums in a transaction, +-- if the type is new as well +BEGIN; +CREATE TYPE bogus AS ENUM(); +ALTER TYPE bogus ADD VALUE 'good'; +ALTER TYPE bogus ADD VALUE 'ugly'; +ROLLBACK; +-- -- Cleanup -- DROP TABLE enumtest_child; diff --git a/src/test/regress/expected/int2.out b/src/test/regress/expected/int2.out index 021d476822..53b484f718 100644 --- a/src/test/regress/expected/int2.out +++ b/src/test/regress/expected/int2.out @@ -255,3 +255,14 @@ SELECT ((-1::int2<<15)+1::int2)::text; -32767 (1 row) +-- check sane handling of INT16_MIN overflow cases +SELECT (-32768)::int2 * (-1)::int2; +ERROR: smallint out of range +SELECT (-32768)::int2 / (-1)::int2; +ERROR: smallint out of range +SELECT (-32768)::int2 % (-1)::int2; + ?column? +---------- + 0 +(1 row) + diff --git a/src/test/regress/expected/int4.out b/src/test/regress/expected/int4.out index 8f780240ae..fcb14e3855 100644 --- a/src/test/regress/expected/int4.out +++ b/src/test/regress/expected/int4.out @@ -342,3 +342,24 @@ SELECT ((-1::int4<<31)+1)::text; -2147483647 (1 row) +-- check sane handling of INT_MIN overflow cases +SELECT (-2147483648)::int4 * (-1)::int4; +ERROR: integer out of range +SELECT (-2147483648)::int4 / (-1)::int4; +ERROR: integer out of range +SELECT (-2147483648)::int4 % (-1)::int4; + ?column? +---------- + 0 +(1 row) + +SELECT (-2147483648)::int4 * (-1)::int2; +ERROR: integer out of range +SELECT (-2147483648)::int4 / (-1)::int2; +ERROR: integer out of range +SELECT (-2147483648)::int4 % (-1)::int2; + ?column? +---------- + 0 +(1 row) + diff --git a/src/test/regress/expected/int8-exp-three-digits.out b/src/test/regress/expected/int8-exp-three-digits.out index b523bfcc01..a1c70ed3e8 100644 --- a/src/test/regress/expected/int8-exp-three-digits.out +++ b/src/test/regress/expected/int8-exp-three-digits.out @@ -815,3 +815,34 @@ SELECT ((-1::int8<<63)+1)::text; -9223372036854775807 (1 row) +-- check sane handling of INT64_MIN overflow cases +SELECT (-9223372036854775808)::int8 * (-1)::int8; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 / (-1)::int8; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 % (-1)::int8; + ?column? +---------- + 0 +(1 row) + +SELECT (-9223372036854775808)::int8 * (-1)::int4; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 / (-1)::int4; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 % (-1)::int4; + ?column? +---------- + 0 +(1 row) + +SELECT (-9223372036854775808)::int8 * (-1)::int2; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 / (-1)::int2; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 % (-1)::int2; + ?column? +---------- + 0 +(1 row) + diff --git a/src/test/regress/expected/int8.out b/src/test/regress/expected/int8.out index 811d6a5520..e79c3a8af9 100644 --- a/src/test/regress/expected/int8.out +++ b/src/test/regress/expected/int8.out @@ -815,3 +815,34 @@ SELECT ((-1::int8<<63)+1)::text; -9223372036854775807 (1 row) +-- check sane handling of INT64_MIN overflow cases +SELECT (-9223372036854775808)::int8 * (-1)::int8; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 / (-1)::int8; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 % (-1)::int8; + ?column? 
+---------- + 0 +(1 row) + +SELECT (-9223372036854775808)::int8 * (-1)::int4; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 / (-1)::int4; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 % (-1)::int4; + ?column? +---------- + 0 +(1 row) + +SELECT (-9223372036854775808)::int8 * (-1)::int2; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 / (-1)::int2; +ERROR: bigint out of range +SELECT (-9223372036854775808)::int8 % (-1)::int2; + ?column? +---------- + 0 +(1 row) + diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index d2c41b5e4f..22265d7a7c 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2826,6 +2826,77 @@ select b.unique1 from (5 rows) +-- +-- test handling of potential equivalence clauses above outer joins +-- +explain (costs off) +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + QUERY PLAN +-------------------------------------------------------------------------------------- + Nested Loop Left Join + Filter: ((COALESCE(b.thousand, 123) = a.q1) AND (a.q1 = COALESCE(b.hundred, 123))) + -> Seq Scan on int8_tbl a + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (a.q1 = unique2) +(5 rows) + +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + q1 | unique2 | thousand | hundred +----+---------+----------+--------- +(0 rows) + +explain (costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + QUERY PLAN +-------------------------------------------------------------------- + Nested Loop Left Join + Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) + -> Seq Scan on int4_tbl a + -> Index Only Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = a.f1) +(5 rows) + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + f1 | unique2 | case +----+---------+------ + 0 | 0 | 0 +(1 row) + +-- +-- test ability to push constants through outer join clauses +-- +explain (costs off) + select * from int4_tbl a left join tenk1 b on f1 = unique2 where f1 = 0; + QUERY PLAN +------------------------------------------------- + Nested Loop Left Join + Join Filter: (a.f1 = b.unique2) + -> Seq Scan on int4_tbl a + Filter: (f1 = 0) + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = 0) +(6 rows) + +explain (costs off) + select * from tenk1 a full join tenk1 b using(unique2) where unique2 = 42; + QUERY PLAN +------------------------------------------------- + Merge Full Join + Merge Cond: (a.unique2 = b.unique2) + -> Index Scan using tenk1_unique2 on tenk1 a + Index Cond: (unique2 = 42) + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = 42) +(6 rows) + -- -- test join removal -- diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index ad1591be59..a235571b3d 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1457,6 +1457,28 @@ ERROR: cannot drop rule _RETURN on view fooview because view fooview requires i HINT: You can drop view fooview instead. 
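The int2/int4/int8 regression cases added above pin down the "minimum value operated on by minus one" corner: multiplication and division must raise an out-of-range error rather than silently overflow or trap, and modulo must return 0 rather than trap. A standalone sketch of the kind of check involved, for the int4 division and modulo cases only; the function names are hypothetical, not the backend's:

#include <stdint.h>
#include <stdio.h>

/* returns 0 on success, -1 for "integer out of range"; division by zero is
 * not handled in this sketch */
static int
demo_int4div(int32_t a, int32_t b, int32_t *result)
{
    if (a == INT32_MIN && b == -1)
        return -1;              /* -2147483648 / -1 does not fit in int32 */
    *result = a / b;
    return 0;
}

/* INT32_MIN % -1 is mathematically 0, but the raw operation can trap on x86,
 * so the divisor -1 is special-cased before dividing */
static int32_t
demo_int4mod(int32_t a, int32_t b)
{
    if (b == -1)
        return 0;
    return a % b;
}

int
main(void)
{
    int32_t     r;

    if (demo_int4div(INT32_MIN, -1, &r) != 0)
        printf("integer out of range\n");
    printf("%d\n", demo_int4mod(INT32_MIN, -1));    /* prints 0 */
    return 0;
}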
drop view fooview; -- +-- test conversion of table to view (needed to load some pg_dump files) +-- +create table fooview (x int, y text); +select xmin, * from fooview; + xmin | x | y +------+---+--- +(0 rows) + +create rule "_RETURN" as on select to fooview do instead + select 1 as x, 'aaa'::text as y; +select * from fooview; + x | y +---+----- + 1 | aaa +(1 row) + +select xmin, * from fooview; -- fail, views don't have such a column +ERROR: column "xmin" does not exist +LINE 1: select xmin, * from fooview; + ^ +drop view fooview; +-- -- check for planner problems with complex inherited UPDATES -- create table id (id serial primary key, name text); diff --git a/src/test/regress/expected/triggers.out b/src/test/regress/expected/triggers.out index b5af066558..94ea61f80c 100644 --- a/src/test/regress/expected/triggers.out +++ b/src/test/regress/expected/triggers.out @@ -1550,3 +1550,197 @@ drop table depth_a, depth_b, depth_c; drop function depth_a_tf(); drop function depth_b_tf(); drop function depth_c_tf(); +-- +-- Test updates to rows during firing of BEFORE ROW triggers. +-- As of 9.2, such cases should be rejected (see bug #6123). +-- +create temp table parent ( + aid int not null primary key, + val1 text, + val2 text, + val3 text, + val4 text, + bcnt int not null default 0); +create temp table child ( + bid int not null primary key, + aid int not null, + val1 text); +create function parent_upd_func() + returns trigger language plpgsql as +$$ +begin + if old.val1 <> new.val1 then + new.val2 = new.val1; + delete from child where child.aid = new.aid and child.val1 = new.val1; + end if; + return new; +end; +$$; +create trigger parent_upd_trig before update on parent + for each row execute procedure parent_upd_func(); +create function parent_del_func() + returns trigger language plpgsql as +$$ +begin + delete from child where aid = old.aid; + return old; +end; +$$; +create trigger parent_del_trig before delete on parent + for each row execute procedure parent_del_func(); +create function child_ins_func() + returns trigger language plpgsql as +$$ +begin + update parent set bcnt = bcnt + 1 where aid = new.aid; + return new; +end; +$$; +create trigger child_ins_trig after insert on child + for each row execute procedure child_ins_func(); +create function child_del_func() + returns trigger language plpgsql as +$$ +begin + update parent set bcnt = bcnt - 1 where aid = old.aid; + return old; +end; +$$; +create trigger child_del_trig after delete on child + for each row execute procedure child_del_func(); +insert into parent values (1, 'a', 'a', 'a', 'a', 0); +insert into child values (10, 1, 'b'); +select * from parent; select * from child; + aid | val1 | val2 | val3 | val4 | bcnt +-----+------+------+------+------+------ + 1 | a | a | a | a | 1 +(1 row) + + bid | aid | val1 +-----+-----+------ + 10 | 1 | b +(1 row) + +update parent set val1 = 'b' where aid = 1; -- should fail +ERROR: tuple to be updated was already modified by an operation triggered by the current command +HINT: Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows. 
+select * from parent; select * from child; + aid | val1 | val2 | val3 | val4 | bcnt +-----+------+------+------+------+------ + 1 | a | a | a | a | 1 +(1 row) + + bid | aid | val1 +-----+-----+------ + 10 | 1 | b +(1 row) + +delete from parent where aid = 1; -- should fail +ERROR: tuple to be updated was already modified by an operation triggered by the current command +HINT: Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows. +select * from parent; select * from child; + aid | val1 | val2 | val3 | val4 | bcnt +-----+------+------+------+------+------ + 1 | a | a | a | a | 1 +(1 row) + + bid | aid | val1 +-----+-----+------ + 10 | 1 | b +(1 row) + +-- replace the trigger function with one that restarts the deletion after +-- having modified a child +create or replace function parent_del_func() + returns trigger language plpgsql as +$$ +begin + delete from child where aid = old.aid; + if found then + delete from parent where aid = old.aid; + return null; -- cancel outer deletion + end if; + return old; +end; +$$; +delete from parent where aid = 1; +select * from parent; select * from child; + aid | val1 | val2 | val3 | val4 | bcnt +-----+------+------+------+------+------ +(0 rows) + + bid | aid | val1 +-----+-----+------ +(0 rows) + +drop table parent, child; +drop function parent_upd_func(); +drop function parent_del_func(); +drop function child_ins_func(); +drop function child_del_func(); +-- similar case, but with a self-referencing FK so that parent and child +-- rows can be affected by a single operation +create temp table self_ref_trigger ( + id int primary key, + parent int references self_ref_trigger, + data text, + nchildren int not null default 0 +); +create function self_ref_trigger_ins_func() + returns trigger language plpgsql as +$$ +begin + if new.parent is not null then + update self_ref_trigger set nchildren = nchildren + 1 + where id = new.parent; + end if; + return new; +end; +$$; +create trigger self_ref_trigger_ins_trig before insert on self_ref_trigger + for each row execute procedure self_ref_trigger_ins_func(); +create function self_ref_trigger_del_func() + returns trigger language plpgsql as +$$ +begin + if old.parent is not null then + update self_ref_trigger set nchildren = nchildren - 1 + where id = old.parent; + end if; + return old; +end; +$$; +create trigger self_ref_trigger_del_trig before delete on self_ref_trigger + for each row execute procedure self_ref_trigger_del_func(); +insert into self_ref_trigger values (1, null, 'root'); +insert into self_ref_trigger values (2, 1, 'root child A'); +insert into self_ref_trigger values (3, 1, 'root child B'); +insert into self_ref_trigger values (4, 2, 'grandchild 1'); +insert into self_ref_trigger values (5, 3, 'grandchild 2'); +update self_ref_trigger set data = 'root!' where id = 1; +select * from self_ref_trigger; + id | parent | data | nchildren +----+--------+--------------+----------- + 2 | 1 | root child A | 1 + 4 | 2 | grandchild 1 | 0 + 3 | 1 | root child B | 1 + 5 | 3 | grandchild 2 | 0 + 1 | | root! | 2 +(5 rows) + +delete from self_ref_trigger; +ERROR: tuple to be updated was already modified by an operation triggered by the current command +HINT: Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows. 
+select * from self_ref_trigger; + id | parent | data | nchildren +----+--------+--------------+----------- + 2 | 1 | root child A | 1 + 4 | 2 | grandchild 1 | 0 + 3 | 1 | root child B | 1 + 5 | 3 | grandchild 2 | 0 + 1 | | root! | 2 +(5 rows) + +drop table self_ref_trigger; +drop function self_ref_trigger_ins_func(); +drop function self_ref_trigger_del_func(); diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f1a72f7eca..663bf8ac56 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -83,9 +83,13 @@ test: select_into select_distinct select_distinct_on select_implicit select_havi # ---------- # Another group of parallel tests # ---------- -test: privileges security_label collate alter_generic +test: privileges security_label collate + +# ---------- +# Another group of parallel tests +# ---------- +test: misc alter_generic -test: misc # rules cannot run concurrently with any test that creates a view test: rules # event triggers cannot run concurrently with any test that runs DDL diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 81e7b69231..166811914f 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -782,6 +782,19 @@ initialize_environment(void) doputenv("PGPORT", s); } + /* + * GNU make stores some flags in the MAKEFLAGS environment variable to + * pass arguments to its own children. If we are invoked by make, + * that causes the make invoked by us to think its part of the make + * task invoking us, and so it tries to communicate with the toplevel + * make. Which fails. + * + * Unset the variable to protect against such problems. We also reset + * MAKELEVEL to be certain the child doesn't notice the make above us. 
+ */ + unsetenv("MAKEFLAGS"); + unsetenv("MAKELEVEL"); + /* * Adjust path variables to point into the temp-install tree */ @@ -1906,13 +1919,6 @@ help(void) int regression_main(int argc, char *argv[], init_function ifunc, test_function tfunc) { - _stringlist *sl; - int c; - int i; - int option_index; - char buf[MAXPGPATH * 4]; - char buf2[MAXPGPATH * 4]; - static struct option long_options[] = { {"help", no_argument, NULL, 'h'}, {"version", no_argument, NULL, 'V'}, @@ -1941,6 +1947,13 @@ regression_main(int argc, char *argv[], init_function ifunc, test_function tfunc {NULL, 0, NULL, 0} }; + _stringlist *sl; + int c; + int i; + int option_index; + char buf[MAXPGPATH * 4]; + char buf2[MAXPGPATH * 4]; + progname = get_progname(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_regress")); diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index f2813fb88c..be789e3f44 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -51,7 +51,6 @@ test: create_function_1 test: create_type test: create_table test: create_function_2 -test: create_function_3 test: copy test: copyselect test: create_misc @@ -59,6 +58,7 @@ test: create_operator test: create_index test: create_view test: create_aggregate +test: create_function_3 test: create_cast test: constraints test: triggers @@ -95,6 +95,7 @@ test: privileges test: security_label test: collate test: misc +test: alter_generic test: rules test: event_trigger test: select_views @@ -125,7 +126,6 @@ test: without_oid test: conversion test: truncate test: alter_table -test: alter_generic test: sequence test: polymorphism test: rowtypes diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 53a2183b3d..38d4757df3 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -278,6 +278,11 @@ explain (costs off) select min(f1), max(f1) from minmaxtest; select min(f1), max(f1) from minmaxtest; +-- DISTINCT doesn't do anything useful here, but it shouldn't fail +explain (costs off) + select distinct min(f1), max(f1) from minmaxtest; +select distinct min(f1), max(f1) from minmaxtest; + drop table minmaxtest cascade; -- check for correct detection of nested-aggregate errors diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 0f9cb380e1..dcf8121d70 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1239,6 +1239,13 @@ select reltoastrelid <> 0 as has_toast_table from pg_class where oid = 'test_storage'::regclass; +-- ALTER TYPE with a check constraint and a child table (bug before Nov 2012) +CREATE TABLE test_inh_check (a float check (a > 10.2)); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check +\d test_inh_check_child + -- -- lock levels -- diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index 1961446fdb..fbaa433be4 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -179,6 +179,86 @@ COPY testnull FROM stdin WITH NULL AS E'\\0'; SELECT * FROM testnull; +CREATE TABLE vistest (LIKE testeoc); +BEGIN; +TRUNCATE vistest; +COPY vistest FROM stdin CSV; +a1 +b +\. +SELECT * FROM vistest; +SAVEPOINT s1; +TRUNCATE vistest; +COPY vistest FROM stdin CSV; +d1 +e +\. +SELECT * FROM vistest; +COMMIT; +SELECT * FROM vistest; + +BEGIN; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +a2 +b +\. 
+SELECT * FROM vistest; +SAVEPOINT s1; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +d2 +e +\. +SELECT * FROM vistest; +COMMIT; +SELECT * FROM vistest; + +BEGIN; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +x +y +\. +SELECT * FROM vistest; +COMMIT; +TRUNCATE vistest; +COPY vistest FROM stdin CSV FREEZE; +p +g +\. +BEGIN; +INSERT INTO vistest VALUES ('z'); +SAVEPOINT s1; +TRUNCATE vistest; +ROLLBACK TO SAVEPOINT s1; +COPY vistest FROM stdin CSV FREEZE; +d3 +e +\. +SELECT * FROM vistest; +COMMIT; +CREATE FUNCTION truncate_in_subxact() RETURNS VOID AS +$$ +BEGIN + TRUNCATE vistest; +EXCEPTION + WHEN OTHERS THEN + INSERT INTO vistest VALUES ('subxact failure'); +END; +$$ language plpgsql; +BEGIN; +INSERT INTO vistest VALUES ('z'); +SELECT truncate_in_subxact(); +COPY vistest FROM stdin CSV FREEZE; +d4 +e +\. +SELECT * FROM vistest; +COMMIT; +SELECT * FROM vistest; +DROP TABLE vistest; +DROP FUNCTION truncate_in_subxact(); DROP TABLE x, y; DROP FUNCTION fn_x_before(); DROP FUNCTION fn_x_after(); diff --git a/src/test/regress/sql/enum.sql b/src/test/regress/sql/enum.sql index 130a723f69..88a835e8aa 100644 --- a/src/test/regress/sql/enum.sql +++ b/src/test/regress/sql/enum.sql @@ -257,6 +257,33 @@ CREATE TYPE bogus AS ENUM('good', 'bad', 'ugly'); CREATE TABLE enumtest_bogus_child(parent bogus REFERENCES enumtest_parent); DROP TYPE bogus; +-- +-- check transactional behaviour of ALTER TYPE ... ADD VALUE +-- +CREATE TYPE bogus AS ENUM('good'); + +-- check that we can't add new values to existing enums in a transaction +BEGIN; +ALTER TYPE bogus ADD VALUE 'bad'; +COMMIT; + +-- check that we recognize the case where the enum already existed but was +-- modified in the current txn +BEGIN; +ALTER TYPE bogus RENAME TO bogon; +ALTER TYPE bogon ADD VALUE 'bad'; +ROLLBACK; + +DROP TYPE bogus; + +-- check that we *can* add new values to existing enums in a transaction, +-- if the type is new as well +BEGIN; +CREATE TYPE bogus AS ENUM(); +ALTER TYPE bogus ADD VALUE 'good'; +ALTER TYPE bogus ADD VALUE 'ugly'; +ROLLBACK; + -- -- Cleanup -- diff --git a/src/test/regress/sql/int2.sql b/src/test/regress/sql/int2.sql index f11eb283c6..bacfbb24ac 100644 --- a/src/test/regress/sql/int2.sql +++ b/src/test/regress/sql/int2.sql @@ -87,3 +87,8 @@ SELECT '' AS five, i.f1, i.f1 / int4 '2' AS x FROM INT2_TBL i; -- corner cases SELECT (-1::int2<<15)::text; SELECT ((-1::int2<<15)+1::int2)::text; + +-- check sane handling of INT16_MIN overflow cases +SELECT (-32768)::int2 * (-1)::int2; +SELECT (-32768)::int2 / (-1)::int2; +SELECT (-32768)::int2 % (-1)::int2; diff --git a/src/test/regress/sql/int4.sql b/src/test/regress/sql/int4.sql index ffae7ce4cb..1843a6d33b 100644 --- a/src/test/regress/sql/int4.sql +++ b/src/test/regress/sql/int4.sql @@ -127,3 +127,11 @@ SELECT (2 + 2) / 2 AS two; -- corner case SELECT (-1::int4<<31)::text; SELECT ((-1::int4<<31)+1)::text; + +-- check sane handling of INT_MIN overflow cases +SELECT (-2147483648)::int4 * (-1)::int4; +SELECT (-2147483648)::int4 / (-1)::int4; +SELECT (-2147483648)::int4 % (-1)::int4; +SELECT (-2147483648)::int4 * (-1)::int2; +SELECT (-2147483648)::int4 / (-1)::int2; +SELECT (-2147483648)::int4 % (-1)::int2; diff --git a/src/test/regress/sql/int8.sql b/src/test/regress/sql/int8.sql index 27e0696b32..2f7f30c91d 100644 --- a/src/test/regress/sql/int8.sql +++ b/src/test/regress/sql/int8.sql @@ -194,3 +194,14 @@ SELECT * FROM generate_series('+4567890123456789'::int8, '+4567890123456799'::in -- corner case SELECT (-1::int8<<63)::text; SELECT 
((-1::int8<<63)+1)::text; + +-- check sane handling of INT64_MIN overflow cases +SELECT (-9223372036854775808)::int8 * (-1)::int8; +SELECT (-9223372036854775808)::int8 / (-1)::int8; +SELECT (-9223372036854775808)::int8 % (-1)::int8; +SELECT (-9223372036854775808)::int8 * (-1)::int4; +SELECT (-9223372036854775808)::int8 / (-1)::int4; +SELECT (-9223372036854775808)::int8 % (-1)::int4; +SELECT (-9223372036854775808)::int8 * (-1)::int2; +SELECT (-9223372036854775808)::int8 / (-1)::int2; +SELECT (-9223372036854775808)::int8 % (-1)::int2; diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index b0cf51cbc8..6c1e3394ad 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -749,6 +749,38 @@ select b.unique1 from right join int4_tbl i2 on i2.f1 = b.tenthous order by 1; +-- +-- test handling of potential equivalence clauses above outer joins +-- + +explain (costs off) +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + +explain (costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + +-- +-- test ability to push constants through outer join clauses +-- + +explain (costs off) + select * from int4_tbl a left join tenk1 b on f1 = unique2 where f1 = 0; + +explain (costs off) + select * from tenk1 a full join tenk1 b using(unique2) where unique2 = 42; + -- -- test join removal -- diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 1de5b0b685..458c2f026c 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -859,6 +859,21 @@ create view fooview as select 'foo'::text; drop rule "_RETURN" on fooview; drop view fooview; +-- +-- test conversion of table to view (needed to load some pg_dump files) +-- + +create table fooview (x int, y text); +select xmin, * from fooview; + +create rule "_RETURN" as on select to fooview do instead + select 1 as x, 'aaa'::text as y; + +select * from fooview; +select xmin, * from fooview; -- fail, views don't have such a column + +drop view fooview; + -- -- check for planner problems with complex inherited UPDATES -- diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql index f88cb81940..78c5407560 100644 --- a/src/test/regress/sql/triggers.sql +++ b/src/test/regress/sql/triggers.sql @@ -1025,3 +1025,158 @@ drop table depth_a, depth_b, depth_c; drop function depth_a_tf(); drop function depth_b_tf(); drop function depth_c_tf(); + +-- +-- Test updates to rows during firing of BEFORE ROW triggers. +-- As of 9.2, such cases should be rejected (see bug #6123). 
+-- + +create temp table parent ( + aid int not null primary key, + val1 text, + val2 text, + val3 text, + val4 text, + bcnt int not null default 0); +create temp table child ( + bid int not null primary key, + aid int not null, + val1 text); + +create function parent_upd_func() + returns trigger language plpgsql as +$$ +begin + if old.val1 <> new.val1 then + new.val2 = new.val1; + delete from child where child.aid = new.aid and child.val1 = new.val1; + end if; + return new; +end; +$$; +create trigger parent_upd_trig before update on parent + for each row execute procedure parent_upd_func(); + +create function parent_del_func() + returns trigger language plpgsql as +$$ +begin + delete from child where aid = old.aid; + return old; +end; +$$; +create trigger parent_del_trig before delete on parent + for each row execute procedure parent_del_func(); + +create function child_ins_func() + returns trigger language plpgsql as +$$ +begin + update parent set bcnt = bcnt + 1 where aid = new.aid; + return new; +end; +$$; +create trigger child_ins_trig after insert on child + for each row execute procedure child_ins_func(); + +create function child_del_func() + returns trigger language plpgsql as +$$ +begin + update parent set bcnt = bcnt - 1 where aid = old.aid; + return old; +end; +$$; +create trigger child_del_trig after delete on child + for each row execute procedure child_del_func(); + +insert into parent values (1, 'a', 'a', 'a', 'a', 0); +insert into child values (10, 1, 'b'); +select * from parent; select * from child; + +update parent set val1 = 'b' where aid = 1; -- should fail +select * from parent; select * from child; + +delete from parent where aid = 1; -- should fail +select * from parent; select * from child; + +-- replace the trigger function with one that restarts the deletion after +-- having modified a child +create or replace function parent_del_func() + returns trigger language plpgsql as +$$ +begin + delete from child where aid = old.aid; + if found then + delete from parent where aid = old.aid; + return null; -- cancel outer deletion + end if; + return old; +end; +$$; + +delete from parent where aid = 1; +select * from parent; select * from child; + +drop table parent, child; + +drop function parent_upd_func(); +drop function parent_del_func(); +drop function child_ins_func(); +drop function child_del_func(); + +-- similar case, but with a self-referencing FK so that parent and child +-- rows can be affected by a single operation + +create temp table self_ref_trigger ( + id int primary key, + parent int references self_ref_trigger, + data text, + nchildren int not null default 0 +); + +create function self_ref_trigger_ins_func() + returns trigger language plpgsql as +$$ +begin + if new.parent is not null then + update self_ref_trigger set nchildren = nchildren + 1 + where id = new.parent; + end if; + return new; +end; +$$; +create trigger self_ref_trigger_ins_trig before insert on self_ref_trigger + for each row execute procedure self_ref_trigger_ins_func(); + +create function self_ref_trigger_del_func() + returns trigger language plpgsql as +$$ +begin + if old.parent is not null then + update self_ref_trigger set nchildren = nchildren - 1 + where id = old.parent; + end if; + return old; +end; +$$; +create trigger self_ref_trigger_del_trig before delete on self_ref_trigger + for each row execute procedure self_ref_trigger_del_func(); + +insert into self_ref_trigger values (1, null, 'root'); +insert into self_ref_trigger values (2, 1, 'root child A'); +insert into 
self_ref_trigger values (3, 1, 'root child B'); +insert into self_ref_trigger values (4, 2, 'grandchild 1'); +insert into self_ref_trigger values (5, 3, 'grandchild 2'); + +update self_ref_trigger set data = 'root!' where id = 1; + +select * from self_ref_trigger; + +delete from self_ref_trigger; + +select * from self_ref_trigger; + +drop table self_ref_trigger; +drop function self_ref_trigger_ins_func(); +drop function self_ref_trigger_del_func(); diff --git a/src/timezone/data/africa b/src/timezone/data/africa index 1c6e5203f1..54c7a1e6ad 100644 --- a/src/timezone/data/africa +++ b/src/timezone/data/africa @@ -4,7 +4,7 @@ # This data is by no means authoritative; if you think you know better, # go ahead and edit the file (and please send any changes to -# tz@elsie.nci.nih.gov for general use in the future). +# tz@iana.org for general use in the future). # From Paul Eggert (2006-03-22): # @@ -424,6 +424,20 @@ Zone Africa/Monrovia -0:43:08 - LMT 1882 # Libya +# From Even Scharning (2012-11-10): +# Libya set their time one hour back at 02:00 on Saturday November 10. +# http://www.libyaherald.com/2012/11/04/clocks-to-go-back-an-hour-on-saturday/ +# Here is an official source [in Arabic]: http://ls.ly/fb6Yc +# +# Steffen Thorsen forwarded a translation (2012-11-10) in +# http://mm.icann.org/pipermail/tz/2012-November/018451.html +# +# From Tim Parenti (2012-11-11): +# Treat the 2012-11-10 change as a zone change from UTC+2 to UTC+1. +# The DST rules planned for 2013 and onward roughly mirror those of Europe +# (either two days before them or five days after them, so as to fall on +# lastFri instead of lastSun). + # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Libya 1951 only - Oct 14 2:00 1:00 S Rule Libya 1952 only - Jan 1 0:00 0 - @@ -438,17 +452,21 @@ Rule Libya 1986 only - Apr 4 0:00 1:00 S Rule Libya 1986 only - Oct 3 0:00 0 - Rule Libya 1987 1989 - Apr 1 0:00 1:00 S Rule Libya 1987 1989 - Oct 1 0:00 0 - +Rule Libya 1997 only - Apr 4 0:00 1:00 S +Rule Libya 1997 only - Oct 4 0:00 0 - +Rule Libya 2013 max - Mar lastFri 1:00 1:00 S +Rule Libya 2013 max - Oct lastFri 2:00 0 - # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Africa/Tripoli 0:52:44 - LMT 1920 1:00 Libya CE%sT 1959 2:00 - EET 1982 1:00 Libya CE%sT 1990 May 4 -# The following entries are from Shanks & Pottenger; +# The 1996 and 1997 entries are from Shanks & Pottenger; # the IATA SSIM data contain some obvious errors. 2:00 - EET 1996 Sep 30 - 1:00 - CET 1997 Apr 4 - 1:00 1:00 CEST 1997 Oct 4 - 2:00 - EET + 1:00 Libya CE%sT 1997 Oct 4 + 2:00 - EET 2012 Nov 10 2:00 + 1:00 Libya CE%sT # Madagascar # Zone NAME GMTOFF RULES FORMAT [UNTIL] diff --git a/src/timezone/data/asia b/src/timezone/data/asia index a02e10924e..d5562c8467 100644 --- a/src/timezone/data/asia +++ b/src/timezone/data/asia @@ -4,7 +4,7 @@ # This data is by no means authoritative; if you think you know better, # go ahead and edit the file (and please send any changes to -# tz@elsie.nci.nih.gov for general use in the future). +# tz@iana.org for general use in the future). # From Paul Eggert (2006-03-22): # @@ -1170,15 +1170,15 @@ Rule Zion 2004 only - Sep 22 1:00 0 S # # ftp://ftp.cs.huji.ac.il/pub/tz/announcements/2005+beyond.ps -# From Paul Eggert (2005-02-22): +# From Paul Eggert (2012-10-26): # I used Ephraim Silverberg's dst-israel.el program # (2005-02-20) # along with Ed Reingold's cal-hebrew in GNU Emacs 21.4, -# to generate the transitions in this list. +# to generate the transitions from 2005 through 2012. # (I replaced "lastFri" with "Fri>=26" by hand.) 
-# The spring transitions below all correspond to the following Rule: +# The spring transitions all correspond to the following Rule: # -# Rule Zion 2005 max - Mar Fri>=26 2:00 1:00 D +# Rule Zion 2005 2012 - Mar Fri>=26 2:00 1:00 D # # but older zic implementations (e.g., Solaris 8) do not support # "Fri>=26" to mean April 1 in years like 2005, so for now we list the @@ -1195,39 +1195,39 @@ Rule Zion 2009 only - Sep 27 2:00 0 S Rule Zion 2010 only - Sep 12 2:00 0 S Rule Zion 2011 only - Apr 1 2:00 1:00 D Rule Zion 2011 only - Oct 2 2:00 0 S -Rule Zion 2012 2015 - Mar Fri>=26 2:00 1:00 D +Rule Zion 2012 only - Mar Fri>=26 2:00 1:00 D Rule Zion 2012 only - Sep 23 2:00 0 S -Rule Zion 2013 only - Sep 8 2:00 0 S -Rule Zion 2014 only - Sep 28 2:00 0 S -Rule Zion 2015 only - Sep 20 2:00 0 S -Rule Zion 2016 only - Apr 1 2:00 1:00 D -Rule Zion 2016 only - Oct 9 2:00 0 S -Rule Zion 2017 2021 - Mar Fri>=26 2:00 1:00 D -Rule Zion 2017 only - Sep 24 2:00 0 S -Rule Zion 2018 only - Sep 16 2:00 0 S -Rule Zion 2019 only - Oct 6 2:00 0 S -Rule Zion 2020 only - Sep 27 2:00 0 S -Rule Zion 2021 only - Sep 12 2:00 0 S -Rule Zion 2022 only - Apr 1 2:00 1:00 D -Rule Zion 2022 only - Oct 2 2:00 0 S -Rule Zion 2023 2032 - Mar Fri>=26 2:00 1:00 D -Rule Zion 2023 only - Sep 24 2:00 0 S -Rule Zion 2024 only - Oct 6 2:00 0 S -Rule Zion 2025 only - Sep 28 2:00 0 S -Rule Zion 2026 only - Sep 20 2:00 0 S -Rule Zion 2027 only - Oct 10 2:00 0 S -Rule Zion 2028 only - Sep 24 2:00 0 S -Rule Zion 2029 only - Sep 16 2:00 0 S -Rule Zion 2030 only - Oct 6 2:00 0 S -Rule Zion 2031 only - Sep 21 2:00 0 S -Rule Zion 2032 only - Sep 12 2:00 0 S -Rule Zion 2033 only - Apr 1 2:00 1:00 D -Rule Zion 2033 only - Oct 2 2:00 0 S -Rule Zion 2034 2037 - Mar Fri>=26 2:00 1:00 D -Rule Zion 2034 only - Sep 17 2:00 0 S -Rule Zion 2035 only - Oct 7 2:00 0 S -Rule Zion 2036 only - Sep 28 2:00 0 S -Rule Zion 2037 only - Sep 13 2:00 0 S + +# From Ephraim Silverberg (2012-10-18): +# Yesterday, the Interior Ministry Committee, after more than a year +# past, approved sending the proposed June 2011 changes to the Time +# Decree Law back to the Knesset for second and third (final) votes +# before the upcoming elections on Jan. 22, 2013. Hence, although the +# changes are not yet law, they are expected to be so before February 2013. +# +# As of 2013, DST starts at 02:00 on the Friday before the last Sunday in March. +# DST ends at 02:00 on the first Sunday after October 1, unless it occurs on the +# second day of the Jewish Rosh Hashana holiday, in which case DST ends a day +# later (i.e. at 02:00 the first Monday after October 2). +# [Rosh Hashana holidays are factored in until 2100.] + +# From Ephraim Silverberg (2012-11-05): +# The Knesset passed today (in second and final readings) the amendment to the +# Time Decree Law making the changes ... law. + +# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S +Rule Zion 2013 max - Mar Fri>=23 2:00 1:00 D +Rule Zion 2013 2026 - Oct Sun>=2 2:00 0 S +Rule Zion 2027 only - Oct Mon>=3 2:00 0 S +Rule Zion 2028 max - Oct Sun>=2 2:00 0 S +# The following rules are commented out for now, as they break older +# versions of zic that support only signed 32-bit timestamps, i.e., +# through 2038-01-19 03:14:07 UTC. 
+#Rule Zion 2028 2053 - Oct Sun>=2 2:00 0 S +#Rule Zion 2054 only - Oct Mon>=3 2:00 0 S +#Rule Zion 2055 2080 - Oct Sun>=2 2:00 0 S +#Rule Zion 2081 only - Oct Mon>=3 2:00 0 S +#Rule Zion 2082 max - Oct Sun>=2 2:00 0 S # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Jerusalem 2:20:56 - LMT 1880 @@ -1362,6 +1362,16 @@ Zone Asia/Tokyo 9:18:59 - LMT 1887 Dec 31 15:00u # From Arthur David Olson (2009-04-06): # We still have Jordan switching to DST on Thursdays in 2000 and 2001. +# From Steffen Thorsen (2012-10-25): +# Yesterday the government in Jordan announced that they will not +# switch back to standard time this winter, so the will stay on DST +# until about the same time next year (at least). +# http://www.petra.gov.jo/Public_News/Nws_NewsDetails.aspx?NewsID=88950 +# +# From Paul Eggert (2012-10-25): +# For now, assume this is just a one-year measure. If it becomes +# permanent, we should move Jordan from EET to AST effective tomorrow. + # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Jordan 1973 only - Jun 6 0:00 1:00 S Rule Jordan 1973 1975 - Oct 1 0:00 0 - @@ -1390,7 +1400,8 @@ Rule Jordan 2002 max - Mar lastThu 24:00 1:00 S Rule Jordan 2003 only - Oct 24 0:00s 0 - Rule Jordan 2004 only - Oct 15 0:00s 0 - Rule Jordan 2005 only - Sep lastFri 0:00s 0 - -Rule Jordan 2006 max - Oct lastFri 0:00s 0 - +Rule Jordan 2006 2011 - Oct lastFri 0:00s 0 - +Rule Jordan 2013 max - Oct lastFri 0:00s 0 - # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Amman 2:23:44 - LMT 1931 2:00 Jordan EE%sT @@ -2041,8 +2052,7 @@ Zone Asia/Karachi 4:28:12 - LMT 1907 # occurred before our cutoff date of 1970. # However, as we get more information, we may need to add entries # for parts of the West Bank as they transitioned from Israel's rules -# to Palestine's rules. If you have more info about this, please -# send it to tz@elsie.nci.nih.gov for incorporation into future editions. +# to Palestine's rules. # From IINS News Service - Israel - 1998-03-23 10:38:07 Israel time, # forwarded by Ephraim Silverberg: @@ -2293,6 +2303,8 @@ Rule Palestine 2010 only - Aug 11 0:00 0 - # From Arthur David Olson (2011-09-20): # 2011 transitions per http://www.timeanddate.com as of 2011-09-20. +# From Paul Eggert (2012-10-12): +# 2012 transitions per http://www.timeanddate.com as of 2012-10-12. # Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Asia/Gaza 2:17:52 - LMT 1900 Oct @@ -2303,7 +2315,7 @@ Zone Asia/Gaza 2:17:52 - LMT 1900 Oct 2:00 Palestine EE%sT 2011 Apr 2 12:01 2:00 1:00 EEST 2011 Aug 1 2:00 - EET 2012 Mar 30 - 2:00 1:00 EEST 2012 Sep 28 + 2:00 1:00 EEST 2012 Sep 21 1:00 2:00 - EET Zone Asia/Hebron 2:20:23 - LMT 1900 Oct @@ -2318,7 +2330,7 @@ Zone Asia/Hebron 2:20:23 - LMT 1900 Oct 2:00 - EET 2011 Aug 30 2:00 1:00 EEST 2011 Sep 30 3:00 2:00 - EET 2012 Mar 30 - 2:00 1:00 EEST 2012 Sep 28 3:00 + 2:00 1:00 EEST 2012 Sep 21 1:00 2:00 - EET # Paracel Is diff --git a/src/timezone/data/australasia b/src/timezone/data/australasia index eb241175d7..bef6f20422 100644 --- a/src/timezone/data/australasia +++ b/src/timezone/data/australasia @@ -628,6 +628,23 @@ Zone Pacific/Pago_Pago 12:37:12 - LMT 1879 Jul 5 # Although Samoa has used Daylight Saving Time in the 2010-2011 and 2011-2012 # seasons, there is not yet any indication that this trend will continue on # a regular basis. For now, we have explicitly listed the transitions below. +# +# From Nicky (2012-09-10): +# Daylight Saving Time commences on Sunday 30th September 2012 and +# ends on Sunday 7th of April 2013. +# +# Please find link below for more information. 
+# http://www.mcil.gov.ws/mcil_publications.html +# +# That publication also includes dates for Summer of 2013/4 as well +# which give the impression of a pattern in selecting dates for the +# future, so for now, we will guess this will continue. + +# Western Samoa +# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S +Rule WS 2012 max - Sep lastSun 3:00 1 D +Rule WS 2012 max - Apr Sun>=1 4:00 0 - +# Zone NAME GMTOFF RULES FORMAT [UNTIL] Zone Pacific/Apia 12:33:04 - LMT 1879 Jul 5 -11:26:56 - LMT 1911 -11:30 - SAMT 1950 # Samoa Time @@ -635,8 +652,8 @@ Zone Pacific/Apia 12:33:04 - LMT 1879 Jul 5 -11:00 1:00 WSDT 2011 Apr 2 4:00 -11:00 - WST 2011 Sep 24 3:00 -11:00 1:00 WSDT 2011 Dec 30 - 13:00 1:00 WSDT 2012 Apr 1 4:00 - 13:00 - WST + 13:00 1:00 WSDT 2012 Apr Sun>=1 4:00 + 13:00 WS WS%sT # Solomon Is # excludes Bougainville, for which see Papua New Guinea @@ -763,7 +780,7 @@ Zone Pacific/Wallis 12:15:20 - LMT 1901 # This data is by no means authoritative; if you think you know better, # go ahead and edit the file (and please send any changes to -# tz@elsie.nci.nih.gov for general use in the future). +# tz@iana.org for general use in the future). # From Paul Eggert (2006-03-22): # A good source for time zone historical data outside the U.S. is diff --git a/src/timezone/data/europe b/src/timezone/data/europe index 5d2acc5a7c..ad9816c10f 100644 --- a/src/timezone/data/europe +++ b/src/timezone/data/europe @@ -4,7 +4,7 @@ # This data is by no means authoritative; if you think you know better, # go ahead and edit the file (and please send any changes to -# tz@elsie.nci.nih.gov for general use in the future). +# tz@iana.org for general use in the future). # From Paul Eggert (2006-03-22): # A good source for time zone historical data outside the U.S. is diff --git a/src/timezone/data/northamerica b/src/timezone/data/northamerica index 1f784e0523..772d7a439e 100644 --- a/src/timezone/data/northamerica +++ b/src/timezone/data/northamerica @@ -6,7 +6,7 @@ # This data is by no means authoritative; if you think you know better, # go ahead and edit the file (and please send any changes to -# tz@elsie.nci.nih.gov for general use in the future). +# tz@iana.org for general use in the future). # From Paul Eggert (1999-03-22): # A reliable and entertaining source about time zones is @@ -2797,6 +2797,13 @@ Zone America/Costa_Rica -5:36:20 - LMT 1890 # San Jose # http://www.timeanddate.com/news/time/cuba-starts-dst-2012.html # +# From Steffen Thorsen (2012-11-03): +# Radio Reloj and many other sources report that Cuba is changing back +# to standard time on 2012-11-04: +# http://www.radioreloj.cu/index.php/noticias-radio-reloj/36-nacionales/9961-regira-horario-normal-en-cuba-desde-el-domingo-cuatro-de-noviembre +# From Paul Eggert (2012-11-03): +# For now, assume the future rule is first Sunday in November. 
+ # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S Rule Cuba 1928 only - Jun 10 0:00 1:00 D Rule Cuba 1928 only - Oct 10 0:00 0 S @@ -2834,7 +2841,7 @@ Rule Cuba 2009 2010 - Mar Sun>=8 0:00s 1:00 D Rule Cuba 2011 only - Mar Sun>=15 0:00s 1:00 D Rule Cuba 2011 only - Nov 13 0:00s 0 S Rule Cuba 2012 only - Apr 1 0:00s 1:00 D -Rule Cuba 2012 max - Oct lastSun 0:00s 0 S +Rule Cuba 2012 max - Nov Sun>=1 0:00s 0 S Rule Cuba 2013 max - Mar Sun>=8 0:00s 1:00 D # Zone NAME GMTOFF RULES FORMAT [UNTIL] diff --git a/src/timezone/data/southamerica b/src/timezone/data/southamerica index ab4d14d0e1..3301a43acd 100644 --- a/src/timezone/data/southamerica +++ b/src/timezone/data/southamerica @@ -4,7 +4,7 @@ # This data is by no means authoritative; if you think you know better, # go ahead and edit the file (and please send any changes to -# tz@elsie.nci.nih.gov for general use in the future). +# tz@iana.org for general use in the future). # From Paul Eggert (2006-03-22): # A good source for time zone historical data outside the U.S. is @@ -829,6 +829,15 @@ Zone America/La_Paz -4:32:36 - LMT 1890 # http://www.in.gov.br/visualiza/index.jsp?data=13/10/2011&jornal=1000&pagina=6&totalArquivos=6 # +# From Kelley Cook (2012-10-16): +# The governor of state of Bahia in Brazil announced on Thursday that +# due to public pressure, he is reversing the DST policy they implemented +# last year and will not be going to Summer Time on October 21st.... +# http://www.correio24horas.com.br/r/artigo/apos-pressoes-wagner-suspende-horario-de-verao-na-bahia + +# From Rodrigo Severo (2012-10-16): +# Tocantins state will have DST. +# http://noticias.terra.com.br/brasil/noticias/0,,OI6232536-EI306.html # Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S # Decree 20,466 (1931-10-01) @@ -1048,7 +1057,8 @@ Zone America/Araguaina -3:12:48 - LMT 1914 -3:00 Brazil BR%sT 1990 Sep 17 -3:00 - BRT 1995 Sep 14 -3:00 Brazil BR%sT 2003 Sep 24 - -3:00 - BRT + -3:00 - BRT 2012 Oct 21 + -3:00 Brazil BR%sT # # Alagoas (AL), Sergipe (SE) Zone America/Maceio -2:22:52 - LMT 1914 @@ -1067,7 +1077,8 @@ Zone America/Maceio -2:22:52 - LMT 1914 Zone America/Bahia -2:34:04 - LMT 1914 -3:00 Brazil BR%sT 2003 Sep 24 -3:00 - BRT 2011 Oct 16 - -3:00 Brazil BR%sT + -3:00 Brazil BR%sT 2012 Oct 21 + -3:00 - BRT # # Goias (GO), Distrito Federal (DF), Minas Gerais (MG), # Espirito Santo (ES), Rio de Janeiro (RJ), Sao Paulo (SP), Parana (PR), diff --git a/src/tools/find_static b/src/tools/find_static index c7014e6014..0643021879 100755 --- a/src/tools/find_static +++ b/src/tools/find_static @@ -16,8 +16,7 @@ trap "rm -f /tmp/$$" 0 1 2 3 15 # symbol, then the function name. find . -name '[a-z]*.o' -type f -print | while read FILE -do - nm $FILE | cut -c10-100 |awk '{printf "%s\t%s\t%s\n", "'"$FILE"'",$1,$2}' +do nm $FILE | cut -c17-100 |awk '{printf "%s\t%s\t%s\n", "'"$FILE"'",$1,$2}' done >/tmp/$$ dropdb debug createdb debug
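For context on the arithmetic tests added above in int2.sql, int4.sql and int8.sql: they pin down the INT16_MIN/INT_MIN/INT64_MIN corner cases, where negating the most negative value of the type overflows. A minimal sketch of the behaviour those queries are presumably meant to lock in, shown for the int2 case (the exact error wording is an assumption here, since the matching expected-output hunks are not part of this excerpt):

SELECT (-32768)::int2 * (-1)::int2;  -- expected to fail with "smallint out of range"
SELECT (-32768)::int2 / (-1)::int2;  -- expected to fail with "smallint out of range"
SELECT (-32768)::int2 % (-1)::int2;  -- expected to return 0 rather than overflow

The int4 and int8 variants follow the same pattern, presumably with "integer out of range" and "bigint out of range" respectively, including the mixed-width operator cases exercised above.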