From 51e20d97283f2e7101d69a73da2f923205171616 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 1 Feb 2021 17:20:09 +0300 Subject: [PATCH 01/63] fix forgotten in 29adb5b comment --- src/utils/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/file.c b/src/utils/file.c index 634ddfba0..0266e23b9 100644 --- a/src/utils/file.c +++ b/src/utils/file.c @@ -1678,7 +1678,7 @@ int fio_send_pages(const char *to_fullpath, const char *from_fullpath, pgFile *f /* send message with header - 8bytes 24bytes var var + 16bytes 24bytes var var -------------------------------------------------------------- | fio_header | fio_send_request | FILE PATH | BITMAP(if any) | -------------------------------------------------------------- From c4a8488205cb563272d06c39dcfa58f0c1f8d935 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Fri, 14 May 2021 12:59:12 +0300 Subject: [PATCH 02/63] porting catchup to release_2_5 branch --- Makefile | 2 +- src/backup.c | 96 ++-- src/catchup.c | 810 ++++++++++++++++++++++++++++++++ src/data.c | 277 +++++++++++ src/help.c | 44 ++ src/pg_probackup.c | 32 +- src/pg_probackup.h | 58 +++ src/stream.c | 13 +- src/util.c | 15 + src/utils/configuration.c | 1 + src/utils/configuration.h | 3 +- src/utils/file.c | 179 ++++++- tests/catchup.py | 157 +++++++ tests/helpers/ptrack_helpers.py | 25 + 14 files changed, 1657 insertions(+), 55 deletions(-) create mode 100644 src/catchup.c create mode 100644 tests/catchup.py diff --git a/Makefile b/Makefile index 1431be4ef..5173aa38f 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OBJS = src/utils/configuration.o src/utils/json.o src/utils/logger.o \ OBJS += src/archive.o src/backup.o src/catalog.o src/checkdb.o src/configure.o src/data.o \ src/delete.o src/dir.o src/fetch.o src/help.o src/init.o src/merge.o \ src/parsexlog.o src/ptrack.o src/pg_probackup.o src/restore.o src/show.o src/stream.o \ - src/util.o src/validate.o src/datapagemap.o + src/util.o src/validate.o 
src/datapagemap.o src/catchup.o # borrowed files OBJS += src/pg_crc.o src/receivelog.o src/streamutil.o \ diff --git a/src/backup.c b/src/backup.c index 7bdf9d6fb..94b6a1024 100644 --- a/src/backup.c +++ b/src/backup.c @@ -27,7 +27,7 @@ //const char *progname = "pg_probackup"; /* list of files contained in backup */ -static parray *backup_files_list = NULL; +parray *backup_files_list = NULL; /* We need critical section for datapagemap_add() in case of using threads */ static pthread_mutex_t backup_pagemap_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -49,17 +49,8 @@ static void *backup_files(void *arg); static void do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs); -static void pg_start_backup(const char *label, bool smooth, pgBackup *backup, - PGNodeInfo *nodeInfo, PGconn *conn); -static void pg_switch_wal(PGconn *conn); -static void pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *nodeInfo); +void pg_switch_wal(PGconn *conn); -static XLogRecPtr wait_wal_lsn(XLogRecPtr lsn, bool is_start_lsn, TimeLineID tli, - bool in_prev_segment, bool segment_only, - int timeout_elevel, bool in_stream_dir); - -static void check_external_for_tablespaces(parray *external_list, - PGconn *backup_conn); static parray *get_database_map(PGconn *pg_startbackup_conn); /* pgpro specific functions */ @@ -83,7 +74,7 @@ backup_stopbackup_callback(bool fatal, void *userdata) if (backup_in_progress) { elog(WARNING, "backup in progress, stop backup"); - pg_stop_backup(NULL, pg_startbackup_conn, NULL); /* don't care about stop_lsn in case of error */ + pg_stop_backup(NULL, pg_startbackup_conn, NULL, NULL); /* don't care about stop_lsn in case of error */ } } @@ -137,7 +128,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool strlen(" with pg_probackup")); /* Call pg_start_backup function in PostgreSQL connect */ - pg_start_backup(label, smooth_checkpoint, &current, nodeInfo, backup_conn); + 
pg_start_backup(label, smooth_checkpoint, current.backup_mode, current.from_replica, &current.start_lsn, nodeInfo, backup_conn); /* Obtain current timeline */ #if PG_VERSION_NUM >= 90600 @@ -268,7 +259,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool * Because WAL streaming will start after pg_start_backup() in stream * mode. */ - wait_wal_lsn(current.start_lsn, true, current.tli, false, true, ERROR, false); + wait_wal_lsn(current.start_lsn, true, current.tli, false, true, ERROR, false, arclog_path); } /* start stream replication */ @@ -525,7 +516,11 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool } /* Notify end of backup */ - pg_stop_backup(&current, backup_conn, nodeInfo); + { + char backup_database_dir[MAXPGPATH]; + pgBackupGetPath(&current, backup_database_dir, lengthof(backup_database_dir), DATABASE_DIR); + pg_stop_backup(&current, backup_conn, nodeInfo, backup_database_dir); + } /* In case of backup from replica >= 9.6 we must fix minRecPoint, * First we must find pg_control in backup_files_list. @@ -1025,8 +1020,8 @@ confirm_block_size(PGconn *conn, const char *name, int blcksz) /* * Notify start of backup to PostgreSQL server. 
*/ -static void -pg_start_backup(const char *label, bool smooth, pgBackup *backup, +void +pg_start_backup(const char *label, bool smooth, BackupMode backup_mode, bool from_replica, XLogRecPtr *start_lsn, PGNodeInfo *nodeInfo, PGconn *conn) { PGresult *res; @@ -1059,12 +1054,12 @@ pg_start_backup(const char *label, bool smooth, pgBackup *backup, /* Extract timeline and LSN from results of pg_start_backup() */ XLogDataFromLSN(PQgetvalue(res, 0, 0), &lsn_hi, &lsn_lo); /* Calculate LSN */ - backup->start_lsn = ((uint64) lsn_hi )<< 32 | lsn_lo; + *start_lsn = ((uint64) lsn_hi )<< 32 | lsn_lo; PQclear(res); - if ((!stream_wal || current.backup_mode == BACKUP_MODE_DIFF_PAGE) && - !backup->from_replica && + if ((!stream_wal || backup_mode == BACKUP_MODE_DIFF_PAGE) && + !from_replica && !(nodeInfo->server_version < 90600 && !nodeInfo->is_superuser)) /* @@ -1080,7 +1075,7 @@ pg_start_backup(const char *label, bool smooth, pgBackup *backup, * Switch to a new WAL segment. It should be called only for master. * For PG 9.5 it should be called only if pguser is superuser. */ -static void +void pg_switch_wal(PGconn *conn) { PGresult *res; @@ -1259,15 +1254,14 @@ pg_is_superuser(PGconn *conn) * Returns target LSN if such is found, failing that returns LSN of record prior to target LSN. * Returns InvalidXLogRecPtr if 'segment_only' flag is used. 
*/ -static XLogRecPtr +XLogRecPtr wait_wal_lsn(XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, bool in_prev_segment, bool segment_only, - int timeout_elevel, bool in_stream_dir) + int timeout_elevel, bool in_stream_dir, const char *wal_segment_dir) { XLogSegNo targetSegNo; - char pg_wal_dir[MAXPGPATH]; + //char pg_wal_dir[MAXPGPATH]; char wal_segment_path[MAXPGPATH], - *wal_segment_dir, wal_segment[MAXFNAMELEN]; bool file_exists = false; uint32 try_count = 0, @@ -1285,6 +1279,7 @@ wait_wal_lsn(XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, GetXLogFileName(wal_segment, tli, targetSegNo, instance_config.xlog_seg_size); + join_path_components(wal_segment_path, wal_segment_dir, wal_segment); /* * In pg_start_backup we wait for 'target_lsn' in 'pg_wal' directory if it is * stream and non-page backup. Page backup needs archived WAL files, so we @@ -1292,7 +1287,7 @@ wait_wal_lsn(XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, * * In pg_stop_backup it depends only on stream_wal. */ - if (in_stream_dir) + /*if (in_stream_dir) { pgBackupGetPath2(&current, pg_wal_dir, lengthof(pg_wal_dir), DATABASE_DIR, PG_XLOG_DIR); @@ -1303,7 +1298,7 @@ wait_wal_lsn(XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, { join_path_components(wal_segment_path, arclog_path, wal_segment); wal_segment_dir = arclog_path; - } + }*/ /* TODO: remove this in 3.0 (it is a cludge against some old bug with archive_timeout) */ if (instance_config.archive_timeout > 0) @@ -1435,9 +1430,9 @@ wait_wal_lsn(XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, /* * Notify end of backup to PostgreSQL server. 
*/ -static void +void pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, - PGNodeInfo *nodeInfo) + PGNodeInfo *nodeInfo, const char *destination_dir) { PGconn *conn; PGresult *res; @@ -1669,9 +1664,12 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, if (stream_wal) { - pgBackupGetPath2(backup, stream_xlog_path, + /*pgBackupGetPath2(backup, stream_xlog_path, lengthof(stream_xlog_path), DATABASE_DIR, PG_XLOG_DIR); + */ + /* destination_dir!= NULL if !cleanup */ + join_path_components(stream_xlog_path, destination_dir, PG_XLOG_DIR); xlog_path = stream_xlog_path; } else @@ -1701,7 +1699,7 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, { /* Wait for segment with current stop_lsn, it is ok for it to never arrive */ wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli, - false, true, WARNING, stream_wal); + false, true, WARNING, stream_wal, xlog_path); /* Get the first record in segment with current stop_lsn */ lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli, @@ -1729,7 +1727,7 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, * because previous record can be the contrecord. 
*/ lsn_tmp = wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli, - true, false, ERROR, stream_wal); + true, false, ERROR, stream_wal, xlog_path); /* sanity */ if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp)) @@ -1743,7 +1741,7 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, { /* Wait for segment with current stop_lsn */ wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli, - false, true, ERROR, stream_wal); + false, true, ERROR, stream_wal, xlog_path); /* Get the next closest record in segment with current stop_lsn */ lsn_tmp = get_next_record_lsn(xlog_path, segno, backup->tli, @@ -1774,10 +1772,12 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, if (!exclusive_backup) { Assert(PQnfields(res) >= 4); - pgBackupGetPath(backup, path, lengthof(path), DATABASE_DIR); + /*pgBackupGetPath(backup, path, lengthof(path), DATABASE_DIR);*/ /* Write backup_label */ - join_path_components(backup_label, path, PG_BACKUP_LABEL_FILE); + /*join_path_components(backup_label, path, PG_BACKUP_LABEL_FILE);*/ + /* destination_dir!= NULL if !cleanup */ + join_path_components(backup_label, destination_dir, PG_BACKUP_LABEL_FILE); fp = fio_fopen(backup_label, PG_BINARY_W, FIO_BACKUP_HOST); if (fp == NULL) elog(ERROR, "can't open backup label file \"%s\": %s", @@ -1827,7 +1827,8 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, { char tablespace_map[MAXPGPATH]; - join_path_components(tablespace_map, path, PG_TABLESPACE_MAP_FILE); + /*join_path_components(tablespace_map, path, PG_TABLESPACE_MAP_FILE);*/ + join_path_components(tablespace_map, destination_dir, PG_TABLESPACE_MAP_FILE); fp = fio_fopen(tablespace_map, PG_BINARY_W, FIO_BACKUP_HOST); if (fp == NULL) elog(ERROR, "can't open tablespace map file \"%s\": %s", @@ -1865,6 +1866,17 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, char *xlog_path, stream_xlog_path[MAXPGPATH]; + if (stream_wal) + { + /*pgBackupGetPath2(backup, stream_xlog_path, + 
lengthof(stream_xlog_path), + DATABASE_DIR, PG_XLOG_DIR);*/ + /* destination_dir!= NULL if backup!= NULL */ + join_path_components(stream_xlog_path, destination_dir, PG_XLOG_DIR); + xlog_path = stream_xlog_path; + } + else + xlog_path = arclog_path; /* * Wait for stop_lsn to be archived or streamed. * If replica returned valid STOP_LSN of not actually existing record, @@ -1872,7 +1884,7 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, */ if (!stop_lsn_exists) stop_backup_lsn = wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli, - false, false, ERROR, stream_wal); + false, false, ERROR, stream_wal, xlog_path); if (stream_wal) { @@ -1880,14 +1892,8 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, * to the passed filelist */ if(wait_WAL_streaming_end(backup_files_list)) elog(ERROR, "WAL streaming failed"); - - pgBackupGetPath2(backup, stream_xlog_path, - lengthof(stream_xlog_path), - DATABASE_DIR, PG_XLOG_DIR); - xlog_path = stream_xlog_path; } - else - xlog_path = arclog_path; + backup->stop_lsn = stop_backup_lsn; backup->recovery_xid = recovery_xid; @@ -2259,7 +2265,7 @@ process_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno) } -static void +void check_external_for_tablespaces(parray *external_list, PGconn *backup_conn) { PGresult *res; diff --git a/src/catchup.c b/src/catchup.c new file mode 100644 index 000000000..5a8beb370 --- /dev/null +++ b/src/catchup.c @@ -0,0 +1,810 @@ +/*------------------------------------------------------------------------- + * + * catchup.c: sync DB cluster + * + * Portions Copyright (c) 2009-2013, NIPPON TELEGRAPH AND TELEPHONE CORPORATION + * Portions Copyright (c) 2015-2021, Postgres Professional + * + *------------------------------------------------------------------------- + */ + +#include "pg_probackup.h" + +#if PG_VERSION_NUM < 110000 +#include "catalog/catalog.h" +#endif +#include "catalog/pg_tablespace.h" +#include "pgtar.h" +#include "streamutil.h" + +#include +#include 
+#include + +#include "utils/thread.h" +#include "utils/file.h" + +/* + * Catchup routines + */ +static void *catchup_files(void *arg); + +static void do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs); + +static void +do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs) +{ + int i; + //char database_path[MAXPGPATH]; + //char external_prefix[MAXPGPATH]; /* Temp value. Used as template */ + char dst_xlog_path[MAXPGPATH]; + char label[1024]; + /* XLogRecPtr prev_backup_start_lsn = InvalidXLogRecPtr; */ + XLogRecPtr sync_lsn = InvalidXLogRecPtr; + XLogRecPtr start_lsn; + + /* arrays with meta info for multi threaded backup */ + pthread_t *threads; + catchup_files_arg *threads_args; + bool backup_isok = true; + + /* pgBackup *prev_backup = NULL; */ + parray *prev_backup_filelist = NULL; + parray *backup_list = NULL; + parray *external_dirs = NULL; + + /* used for multitimeline incremental backup */ + parray *tli_list = NULL; + + /* for fancy reporting */ + time_t start_time, end_time; + char pretty_time[20]; + char pretty_bytes[20]; + + elog(LOG, "Database catchup start"); + if(current.external_dir_str) + { + external_dirs = make_external_directory_list(current.external_dir_str, + false); + check_external_for_tablespaces(external_dirs, source_conn); + } + + /* Clear ptrack files for not PTRACK backups */ + if (backup_mode != BACKUP_MODE_DIFF_PTRACK && nodeInfo->is_ptrack_enable) + pg_ptrack_clear(source_conn, nodeInfo->ptrack_version_num); + + /* notify start of backup to PostgreSQL server */ + time2iso(label, lengthof(label), current.start_time, false); + strncat(label, " with pg_probackup", lengthof(label) - + strlen(" with pg_probackup")); + + /* Call pg_start_backup function in PostgreSQL connect */ + pg_start_backup(label, smooth_checkpoint, backup_mode, 
current.from_replica, &start_lsn, nodeInfo, source_conn); + elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (start_lsn >> 32), (uint32) (start_lsn)); + + /* Obtain current timeline */ +#if PG_VERSION_NUM >= 90600 + current.tli = get_current_timeline(source_conn); +#else + current.tli = get_current_timeline_from_control(false); +#endif + + /* In PAGE mode or in ARCHIVE wal-mode wait for current segment */ + if (backup_mode == BACKUP_MODE_DIFF_PAGE ||!stream_wal) + /* + * Do not wait start_lsn for stream backup. + * Because WAL streaming will start after pg_start_backup() in stream + * mode. + */ + wait_wal_lsn(start_lsn, true, current.tli, false, true, ERROR, false, arclog_path); + + if (backup_mode == BACKUP_MODE_DIFF_PAGE || + backup_mode == BACKUP_MODE_DIFF_PTRACK || + backup_mode == BACKUP_MODE_DIFF_DELTA) + { + prev_backup_filelist = parray_new(); + dir_list_file(prev_backup_filelist, dest_pgdata, + true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); + + sync_lsn = get_min_recovery_point(dest_pgdata); + elog(INFO, "syncLSN = %X/%X", (uint32) (sync_lsn >> 32), (uint32) sync_lsn); + } + + /* + * It`s illegal to take PTRACK backup if LSN from ptrack_control() is not + * equal to start_lsn of previous backup. 
+ */ + if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + { + XLogRecPtr ptrack_lsn = get_last_ptrack_lsn(source_conn, nodeInfo); + + if (nodeInfo->ptrack_version_num < 20) + { + elog(ERROR, "ptrack extension is too old.\n" + "Upgrade ptrack to version >= 2"); + } + else + { + // new ptrack is more robust and checks Start LSN + if (ptrack_lsn > sync_lsn || ptrack_lsn == InvalidXLogRecPtr) + { + elog(ERROR, "LSN from ptrack_control %X/%X is greater than checkpoint LSN %X/%X.\n" + "Create new full backup before an incremental one.", + (uint32) (ptrack_lsn >> 32), (uint32) (ptrack_lsn), + (uint32) (sync_lsn >> 32), + (uint32) (sync_lsn)); + } + } + } + + /* For incremental backup check that start_lsn is not from the past + * Though it will not save us if PostgreSQL instance is actually + * restored STREAM backup. + */ + /* TODO это нужно? */ + if (backup_mode != BACKUP_MODE_FULL && + sync_lsn > start_lsn) + elog(ERROR, "Current START LSN %X/%X is lower than START LSN %X/%X. " + "It may indicate that we are trying to backup PostgreSQL instance from the past.", + (uint32) (start_lsn >> 32), (uint32) (start_lsn), + (uint32) (sync_lsn >> 32), (uint32) (sync_lsn)); + + /* Update running backup meta with START LSN */ + //write_backup(&current, true); + + //pgBackupGetPath(&current, database_path, lengthof(database_path), + // DATABASE_DIR); + //pgBackupGetPath(&current, external_prefix, lengthof(external_prefix), + // EXTERNAL_DIR); + + /* start stream replication */ + if (stream_wal) + { + instance_config.system_identifier = get_system_identifier(source_pgdata); + join_path_components(dst_xlog_path, dest_pgdata, PG_XLOG_DIR); + fio_mkdir(dst_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); + + start_WAL_streaming(source_conn, dst_xlog_path, &instance_config.conn_opt, + start_lsn, current.tli); + } + + /* initialize backup list */ + backup_files_list = parray_new(); + + /* list files with the logical path. 
omit $PGDATA */ + if (fio_is_remote(FIO_DB_HOST)) + fio_list_dir(backup_files_list, source_pgdata, + true, true, false, backup_logs, true, 0); + else + dir_list_file(backup_files_list, source_pgdata, + true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); + + /* + * Append to backup list all files and directories + * from external directory option + */ + if (external_dirs) + { + for (i = 0; i < parray_num(external_dirs); i++) + { + /* External dirs numeration starts with 1. + * 0 value is not external dir */ + if (fio_is_remote(FIO_DB_HOST)) + fio_list_dir(backup_files_list, parray_get(external_dirs, i), + false, true, false, false, true, i+1); + else + dir_list_file(backup_files_list, parray_get(external_dirs, i), + false, true, false, false, true, i+1, FIO_LOCAL_HOST); + } + } + + /* close ssh session in main thread */ + fio_disconnect(); + + /* Sanity check for backup_files_list, thank you, Windows: + * https://github.com/postgrespro/pg_probackup/issues/48 + */ + + if (parray_num(backup_files_list) < 100) + elog(ERROR, "PGDATA is almost empty. Either it was concurrently deleted or " + "pg_probackup do not possess sufficient permissions to list PGDATA content"); + + /* Calculate pgdata_bytes */ + for (i = 0; i < parray_num(backup_files_list); i++) + { + pgFile *file = (pgFile *) parray_get(backup_files_list, i); + + if (file->external_dir_num != 0) + continue; + + if (S_ISDIR(file->mode)) + { + current.pgdata_bytes += 4096; + continue; + } + + current.pgdata_bytes += file->size; + } + + pretty_size(current.pgdata_bytes, pretty_bytes, lengthof(pretty_bytes)); + elog(INFO, "PGDATA size: %s", pretty_bytes); + + /* + * Sort pathname ascending. It is necessary to create intermediate + * directories sequentially. + * + * For example: + * 1 - create 'base' + * 2 - create 'base/1' + * + * Sorted array is used at least in parse_filelist_filenames(), + * extractPageMap(), make_pagemap_from_ptrack(). 
+ */ + parray_qsort(backup_files_list, pgFileCompareRelPathWithExternal); + + /* Extract information about files in backup_list parsing their names:*/ + parse_filelist_filenames(backup_files_list, source_pgdata); + + elog(LOG, "Current Start LSN: %X/%X, TLI: %X", + (uint32) (start_lsn >> 32), (uint32) (start_lsn), + current.tli); + /* TODO проверить, нужна ли проверка TLI */ + /*if (backup_mode != BACKUP_MODE_FULL) + elog(LOG, "Parent Start LSN: %X/%X, TLI: %X", + (uint32) (sync_lsn >> 32), (uint32) (sync_lsn), + prev_backup->tli); + */ + /* + * Build page mapping in incremental mode. + */ + + if (backup_mode == BACKUP_MODE_DIFF_PAGE || + backup_mode == BACKUP_MODE_DIFF_PTRACK) + { + bool pagemap_isok = true; + + time(&start_time); + elog(INFO, "Extracting pagemap of changed blocks"); + + if (backup_mode == BACKUP_MODE_DIFF_PAGE) + { + /* + * Build the page map. Obtain information about changed pages + * reading WAL segments present in archives up to the point + * where this backup has started. + */ + /* TODO page пока не поддерживается */ + /* pagemap_isok = extractPageMap(arclog_path, instance_config.xlog_seg_size, + sync_lsn, prev_backup->tli, + current.start_lsn, current.tli, tli_list); + */ + } + else if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + { + /* + * Build the page map from ptrack information. 
+ */ + make_pagemap_from_ptrack_2(backup_files_list, source_conn, + nodeInfo->ptrack_schema, + nodeInfo->ptrack_version_num, + sync_lsn); + } + + time(&end_time); + + /* TODO: add ms precision */ + if (pagemap_isok) + elog(INFO, "Pagemap successfully extracted, time elapsed: %.0f sec", + difftime(end_time, start_time)); + else + elog(ERROR, "Pagemap extraction failed, time elasped: %.0f sec", + difftime(end_time, start_time)); + } + + /* + * Make directories before backup and setup threads at the same time + */ + for (i = 0; i < parray_num(backup_files_list); i++) + { + pgFile *file = (pgFile *) parray_get(backup_files_list, i); + + /* if the entry was a directory, create it in the backup */ + if (S_ISDIR(file->mode)) + { + char dirpath[MAXPGPATH]; + + if (file->external_dir_num) + { + char temp[MAXPGPATH]; + /* TODO пока непонятно, разобраться! */ + /* snprintf(temp, MAXPGPATH, "%s%d", external_prefix, + file->external_dir_num); */ + join_path_components(dirpath, temp, file->rel_path); + } + else + join_path_components(dirpath, dest_pgdata, file->rel_path); + + elog(VERBOSE, "Create directory '%s'", dirpath); + fio_mkdir(dirpath, DIR_PERMISSION, FIO_BACKUP_HOST); + } + + /* setup threads */ + pg_atomic_clear_flag(&file->lock); + } + + /* Sort by size for load balancing */ + parray_qsort(backup_files_list, pgFileCompareSize); + /* Sort the array for binary search */ + if (prev_backup_filelist) + parray_qsort(prev_backup_filelist, pgFileCompareRelPathWithExternal); + + /* write initial backup_content.control file and update backup.control */ + //write_backup_filelist(&current, backup_files_list, + // instance_config.pgdata, external_dirs, true); + //write_backup(&current, true); + + /* Init backup page header map */ + //init_header_map(&current); + + /* init thread args with own file lists */ + threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); + threads_args = (catchup_files_arg *) palloc(sizeof(catchup_files_arg)*num_threads); + + for (i = 0; i < num_threads; i++) + { 
+ catchup_files_arg *arg = &(threads_args[i]); + + arg->nodeInfo = nodeInfo; + arg->from_root = source_pgdata; + arg->to_root = dest_pgdata; + /* TODO разобраться */ + //arg->external_prefix = external_prefix; + //arg->external_dirs = external_dirs; + arg->files_list = backup_files_list; + /* TODO !!!! change to target file_list */ + arg->prev_filelist = prev_backup_filelist; + /* arg->prev_start_lsn = prev_backup_start_lsn; */ + arg->prev_start_lsn = sync_lsn; + arg->backup_mode = backup_mode; + arg->conn_arg.conn = NULL; + arg->conn_arg.cancel_conn = NULL; + /* TODO !!!! */ + arg->hdr_map = &(current.hdr_map); + arg->thread_num = i+1; + /* By default there are some error */ + arg->ret = 1; + } + + /* Run threads */ + thread_interrupted = false; + elog(INFO, "Start transferring data files"); + time(&start_time); + for (i = 0; i < num_threads; i++) + { + catchup_files_arg *arg = &(threads_args[i]); + + elog(VERBOSE, "Start thread num: %i", i); + pthread_create(&threads[i], NULL, catchup_files, arg); + } + + /* Wait threads */ + for (i = 0; i < num_threads; i++) + { + pthread_join(threads[i], NULL); + if (threads_args[i].ret == 1) + backup_isok = false; + } + + time(&end_time); + pretty_time_interval(difftime(end_time, start_time), + pretty_time, lengthof(pretty_time)); + if (backup_isok) + elog(INFO, "Data files are transferred, time elapsed: %s", + pretty_time); + else + elog(ERROR, "Data files transferring failed, time elapsed: %s", + pretty_time); + + /* clean previous backup file list */ + if (prev_backup_filelist) + { + parray_walk(prev_backup_filelist, pgFileFree); + parray_free(prev_backup_filelist); + } + + /* Notify end of backup */ + current.start_lsn = start_lsn; + pg_stop_backup(&current, source_conn, nodeInfo, dest_pgdata); + + /* In case of backup from replica >= 9.6 we must fix minRecPoint, + * First we must find pg_control in backup_files_list. 
+ */ + if (current.from_replica && !exclusive_backup) + { + pgFile *pg_control = NULL; + + for (i = 0; i < parray_num(backup_files_list); i++) + { + pgFile *tmp_file = (pgFile *) parray_get(backup_files_list, i); + + if (tmp_file->external_dir_num == 0 && + (strcmp(tmp_file->rel_path, XLOG_CONTROL_FILE) == 0)) + { + pg_control = tmp_file; + break; + } + } + + if (!pg_control) + elog(ERROR, "Failed to find file \"%s\" in backup filelist.", + XLOG_CONTROL_FILE); + + set_min_recovery_point(pg_control, dest_pgdata, current.stop_lsn); + } + + /* close and sync page header map */ + //if (current.hdr_map.fp) + //{ + // cleanup_header_map(&(current.hdr_map)); + // + // if (fio_sync(current.hdr_map.path, FIO_BACKUP_HOST) != 0) + // elog(ERROR, "Cannot sync file \"%s\": %s", current.hdr_map.path, strerror(errno)); + //} + + /* close ssh session in main thread */ + fio_disconnect(); + + /* Print the list of files to backup catalog */ + //write_backup_filelist(&current, backup_files_list, instance_config.pgdata, + // external_dirs, true); + /* update backup control file to update size info */ + //write_backup(&current, true); + + /* Sync all copied files unless '--no-sync' flag is used */ + if (no_sync) + elog(WARNING, "Backup files are not synced to disk"); + else + { + elog(INFO, "Syncing backup files to disk"); + time(&start_time); + + for (i = 0; i < parray_num(backup_files_list); i++) + { + char to_fullpath[MAXPGPATH]; + pgFile *file = (pgFile *) parray_get(backup_files_list, i); + + /* TODO: sync directory ? 
*/ + if (S_ISDIR(file->mode)) + continue; + + if (file->write_size <= 0) + continue; + + /* construct fullpath */ + if (file->external_dir_num == 0) + join_path_components(to_fullpath, dest_pgdata, file->rel_path); + /* TODO разобраться с external */ + /*else + { + char external_dst[MAXPGPATH]; + + makeExternalDirPathByNum(external_dst, external_prefix, + file->external_dir_num); + join_path_components(to_fullpath, external_dst, file->rel_path); + } + */ + if (fio_sync(to_fullpath, FIO_BACKUP_HOST) != 0) + elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); + } + + time(&end_time); + pretty_time_interval(difftime(end_time, start_time), + pretty_time, lengthof(pretty_time)); + elog(INFO, "Backup files are synced, time elapsed: %s", pretty_time); + } + + /* be paranoid about instance been from the past */ + // if (backup_mode != BACKUP_MODE_FULL && + // current.stop_lsn < prev_backup->stop_lsn) + // elog(ERROR, "Current backup STOP LSN %X/%X is lower than STOP LSN %X/%X of previous backup %s. " + // "It may indicate that we are trying to backup PostgreSQL instance from the past.", + // (uint32) (current.stop_lsn >> 32), (uint32) (current.stop_lsn), + // (uint32) (prev_backup->stop_lsn >> 32), (uint32) (prev_backup->stop_lsn), + // base36enc(prev_backup->stop_lsn)); + + /* clean external directories list */ + if (external_dirs) + free_dir_list(external_dirs); + + /* Cleanup */ + if (backup_list) + { + parray_walk(backup_list, pgBackupFree); + parray_free(backup_list); + } + + if (tli_list) + { + parray_walk(tli_list, timelineInfoFree); + parray_free(tli_list); + } + + parray_walk(backup_files_list, pgFileFree); + parray_free(backup_files_list); + backup_files_list = NULL; + // где закрывается backup_conn? +} + +/* + * Entry point of pg_probackup CATCHUP subcommand. 
+ * + */ +int +do_catchup(char *source_pgdata, char *dest_pgdata, BackupMode backup_mode, ConnectionOptions conn_opt, bool stream_wal, int num_threads) +{ + PGconn *backup_conn = NULL; + PGNodeInfo nodeInfo; + //char pretty_bytes[20]; + bool no_sync = false; + bool backup_logs = false; + + /* Initialize PGInfonode */ + pgNodeInit(&nodeInfo); + + /* ugly hack */ + instance_config.xlog_seg_size = DEFAULT_XLOG_SEG_SIZE; + + //if (!instance_config.pgdata) + // elog(ERROR, "required parameter not specified: PGDATA " + // "(-D, --pgdata)"); + + /* Update backup status and other metainfo. */ + //current.status = BACKUP_STATUS_RUNNING; + //current.start_time = start_time; + + StrNCpy(current.program_version, PROGRAM_VERSION, + sizeof(current.program_version)); + + //current.compress_alg = instance_config.compress_alg; + //current.compress_level = instance_config.compress_level; + + /* Save list of external directories */ + //if (instance_config.external_dir_str && + // (pg_strcasecmp(instance_config.external_dir_str, "none") != 0)) + // current.external_dir_str = instance_config.external_dir_str; + + elog(INFO, "Catchup start, pg_probackup version: %s, `" + "wal mode: %s, remote: %s, catchup-source-pgdata: %s, catchup-destination-pgdata: %s", + PROGRAM_VERSION, + current.stream ? "STREAM" : "ARCHIVE", IsSshProtocol() ? 
"true" : "false", + source_pgdata, dest_pgdata); + + /* Create backup directory and BACKUP_CONTROL_FILE */ + //if (pgBackupCreateDir(&current)) + // elog(ERROR, "Cannot create backup directory"); + //if (!lock_backup(&current, true)) + // elog(ERROR, "Cannot lock backup %s directory", + // base36enc(current.start_time)); + //write_backup(&current, true); + + //elog(LOG, "Backup destination is initialized"); + + /* + * setup backup_conn, do some compatibility checks and + * fill basic info about instance + */ + backup_conn = pgdata_basic_setup(instance_config.conn_opt, &nodeInfo); + + //if (current.from_replica) + // elog(INFO, "Backup %s is going to be taken from standby", base36enc(start_time)); + + /* TODO, print PostgreSQL full version */ + //elog(INFO, "PostgreSQL version: %s", nodeInfo.server_version_str); + + /* + * Ensure that backup directory was initialized for the same PostgreSQL + * instance we opened connection to. And that target backup database PGDATA + * belogns to the same instance. + */ + //check_system_identifiers(backup_conn, instance_config.pgdata); + + /* below perform checks specific for backup command */ +#if PG_VERSION_NUM >= 110000 + if (!RetrieveWalSegSize(backup_conn)) + elog(ERROR, "Failed to retrieve wal_segment_size"); +#endif + + get_ptrack_version(backup_conn, &nodeInfo); + // elog(WARNING, "ptrack_version_num %d", ptrack_version_num); + + if (nodeInfo.ptrack_version_num > 0) + nodeInfo.is_ptrack_enable = pg_ptrack_enable(backup_conn, nodeInfo.ptrack_version_num); + + if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + { + if (nodeInfo.ptrack_version_num == 0) + elog(ERROR, "This PostgreSQL instance does not support ptrack"); + else + { + if (!nodeInfo.is_ptrack_enable) + elog(ERROR, "Ptrack is disabled"); + } + } + + if (current.from_replica && exclusive_backup) + /* Check master connection options */ + if (instance_config.master_conn_opt.pghost == NULL) + elog(ERROR, "Options for connection to master must be provided to perform backup from replica"); + + 
/* backup data */
+	do_catchup_instance(source_pgdata, dest_pgdata, backup_conn, &nodeInfo, backup_mode, no_sync, backup_logs);
+
+	//if (!no_validate)
+	//	pgBackupValidate(¤t, NULL);
+
+	/* Notify user about backup size */
+	//if (current.stream)
+	//	pretty_size(current.data_bytes + current.wal_bytes, pretty_bytes, lengthof(pretty_bytes));
+	//else
+	//	pretty_size(current.data_bytes, pretty_bytes, lengthof(pretty_bytes));
+	//elog(INFO, "Backup %s resident size: %s", base36enc(current.start_time), pretty_bytes);
+
+	//if (current.status == BACKUP_STATUS_OK ||
+	//	current.status == BACKUP_STATUS_DONE)
+	//	elog(INFO, "Backup %s completed", base36enc(current.start_time));
+	//else
+	//	elog(ERROR, "Backup %s failed", base36enc(current.start_time));
+
+	return 0;
+}
+
+/*
+ * Worker routine: copy the files of arguments->files_list from from_root to
+ * to_root. Directories are skipped; a file whose lock flag is already set is
+ * skipped too (presumably it was taken by another worker - confirm).
+ * Data files (is_datafile and not is_cfs) go through catchup_data_file(),
+ * all other files through backup_non_data_file().
+ * After the loop the connections are closed and arguments->ret is set to 0.
+ */
+static void *
+catchup_files(void *arg)
+{
+	int			i;
+	char		from_fullpath[MAXPGPATH];
+	char		to_fullpath[MAXPGPATH];
+	static time_t prev_time;
+
+	catchup_files_arg *arguments = (catchup_files_arg *) arg;
+	int			n_catchup_files_list = parray_num(arguments->files_list);
+
+	/* TODO !!!! remove current */
+	prev_time = current.start_time;
+
+	/* backup a file */
+	for (i = 0; i < n_catchup_files_list; i++)
+	{
+		pgFile	*file = (pgFile *) parray_get(arguments->files_list, i);
+		pgFile	*prev_file = NULL;
+
+		/* We have already copied all directories */
+		if (S_ISDIR(file->mode))
+			continue;
+
+		if (arguments->thread_num == 1)
+		{
+			/* update backup_content.control every 60 seconds */
+			if ((difftime(time(NULL), prev_time)) > 60)
+			{
+				// write_backup_filelist(&current, arguments->files_list, arguments->from_root,
+				//					   arguments->external_dirs, false);
+				/* update backup control file to update size info */
+				//write_backup(&current, true);
+
+				prev_time = time(NULL);
+			}
+		}
+
+		if (!pg_atomic_test_set_flag(&file->lock))
+			continue;
+
+		/* check for interrupt */
+		if (interrupted || thread_interrupted)
+			elog(ERROR, "interrupted during backup");
+
+		if (progress)
+			elog(INFO, "Progress: (%d/%d). Process file \"%s\"",
+				 i + 1, n_catchup_files_list, file->rel_path);
+
+		/* Handle zero sized files */
+		//if (file->size == 0)
+		//{
+		//	file->write_size = 0;
+		//	continue;
+		//}
+
+		/* construct destination filepath */
+		/* TODO: find out whether external dirs are needed; if external_dir_num != 0 both paths stay unset */
+		if (file->external_dir_num == 0)
+		{
+			join_path_components(from_fullpath, arguments->from_root, file->rel_path);
+			join_path_components(to_fullpath, arguments->to_root, file->rel_path);
+		}
+		/*else
+		{
+			char	external_dst[MAXPGPATH];
+			char	*external_path = parray_get(arguments->external_dirs,
+												file->external_dir_num - 1);
+
+			makeExternalDirPathByNum(external_dst,
+								 arguments->external_prefix,
+								 file->external_dir_num);
+
+			join_path_components(to_fullpath, external_dst, file->rel_path);
+			join_path_components(from_fullpath, external_path, file->rel_path);
+		}
+		*/
+
+		/* Encountered some strange beast. NOTE(review): only warns, the file is still processed below */
+		if (!S_ISREG(file->mode))
+			elog(WARNING, "Unexpected type %d of file \"%s\", skipping",
+							file->mode, from_fullpath);
+
+		/* Check that file exist in previous backup */
+		if (arguments->backup_mode != BACKUP_MODE_FULL)
+		{
+			pgFile	**prev_file_tmp = NULL;
+			prev_file_tmp = (pgFile **) parray_bsearch(arguments->prev_filelist,
+											file, pgFileCompareRelPathWithExternal);
+			if (prev_file_tmp)
+			{
+				/* File exists in previous backup */
+				file->exists_in_prev = true;
+				prev_file = *prev_file_tmp;
+			}
+		}
+
+		/* backup file */
+		if (file->is_datafile && !file->is_cfs)
+		{
+			catchup_data_file(&(arguments->conn_arg), file, from_fullpath, to_fullpath,
+								 arguments->prev_start_lsn,
+								 arguments->backup_mode,
+								 NONE_COMPRESS,
+								 0,
+								 arguments->nodeInfo->checksum_version,
+								 arguments->nodeInfo->ptrack_version_num,
+								 arguments->nodeInfo->ptrack_schema,
+								 arguments->hdr_map, false);
+		}
+		else
+		{
+			backup_non_data_file(file, prev_file, from_fullpath, to_fullpath,
+								 arguments->backup_mode, current.parent_backup, true);
+		}
+
+		if (file->write_size == FILE_NOT_FOUND)
+			continue;
+
+		if (file->write_size == BYTES_INVALID)
+		{
+			elog(VERBOSE, "Skipping the unchanged file: \"%s\"", from_fullpath);
+			continue;
+		}
+
+		elog(VERBOSE, "File \"%s\". Copied "INT64_FORMAT " bytes",
+						from_fullpath, file->write_size);
+	}
+
+	/* ssh connection is no longer needed */
+	fio_disconnect();
+
+	/* Close connection */
+	if (arguments->conn_arg.conn)
+		pgut_disconnect(arguments->conn_arg.conn);
+
+	/* Data files transferring is successful */
+	arguments->ret = 0;
+
+	return NULL;
+}
+
diff --git a/src/data.c b/src/data.c
index d3f67f43c..b1b5b1fea 100644
--- a/src/data.c
+++ b/src/data.c
@@ -522,6 +522,23 @@ compress_and_backup_page(pgFile *file, BlockNumber blknum,
 	return compressed_size;
 }
 
+/* Derived from compress_and_backup_page with all the header and compression magic removed: a plain 1-to-1 page copy */
+static int
+copy_page(pgFile *file, BlockNumber blknum,
+		  FILE *in, FILE *out, Page page,
+		  const char *to_fullpath)
+{
+	/* write data page */
+	if (fio_fwrite(out, page, BLCKSZ) != BLCKSZ)
+		elog(ERROR, "File: \"%s\", cannot write at block %u: %s",
+			 to_fullpath, blknum, strerror(errno));
+
+	file->write_size += BLCKSZ;
+	file->uncompressed_size += BLCKSZ;
+
+	return BLCKSZ;
+}
+
 /*
  * Backup data file in the from_root directory to the to_root directory with
  * same relative path. If prev_backup_start_lsn is not NULL, only pages with
@@ -688,6 +705,168 @@ backup_data_file(ConnectionArgs* conn_arg, pgFile *file,
 	pg_free(headers);
 }
 
+/*
+ * Copy data file from_fullpath to to_fullpath block by block, through
+ * fio_copy_pages() when FIO_DB_HOST is remote and copy_pages() otherwise.
+ * prev_backup_start_lsn is handed on only in DELTA mode for files that
+ * exist in the previous backup; the pagemap is used when it is present.
+ * file->write_size is set to FILE_NOT_FOUND if the source file is missing
+ * and to BYTES_INVALID if the file is considered unchanged.
+ */
+void
+catchup_data_file(ConnectionArgs* conn_arg, pgFile *file,
+				 const char *from_fullpath, const char *to_fullpath,
+				 XLogRecPtr prev_backup_start_lsn, BackupMode backup_mode,
+				 CompressAlg calg, int clevel, uint32 checksum_version,
+				 int ptrack_version_num, const char *ptrack_schema,
+				 HeaderMap *hdr_map, bool is_merge)
+{
+	int		rc;
+	bool	use_pagemap;
+	char	*errmsg = NULL;
+	BlockNumber err_blknum = 0;
+	/* page headers */
+	BackupPageHeader2 *headers = NULL;
+
+	/* sanity */
+	if (file->size % BLCKSZ != 0)
+		elog(WARNING, "File: \"%s\", invalid file size %zu", from_fullpath, file->size);
+
+	/*
+	 * Compute expected number of blocks in the file.
+	 * NOTE This is a normal situation, if the file size has changed
+	 * since the moment we computed it.
+	 */
+	file->n_blocks = file->size/BLCKSZ;
+
+	/*
+	 * Skip unchanged file only if it exists in previous backup.
+	 * This way we can correctly handle null-sized files which are
+	 * not tracked by pagemap and thus always marked as unchanged.
+	 */
+	if ((backup_mode == BACKUP_MODE_DIFF_PAGE ||
+		backup_mode == BACKUP_MODE_DIFF_PTRACK) &&
+		file->pagemap.bitmapsize == PageBitmapIsEmpty &&
+		file->exists_in_prev && !file->pagemap_isabsent)
+	{
+		/*
+		 * There are no changed blocks since last backup. We want to make
+		 * incremental backup, so we should exit.
+		 */
+		file->write_size = BYTES_INVALID;
+		return;
+	}
+
+	/* reset size summary */
+	file->read_size = 0;
+	file->write_size = 0;
+	file->uncompressed_size = 0;
+	INIT_FILE_CRC32(true, file->crc);
+
+	/*
+	 * Read each page, verify checksum and write it to backup.
+	 * If page map is empty or file is not present in previous backup
+	 * backup all pages of the relation.
+	 *
+	 * In PTRACK 1.x there was a problem
+	 * of data files with missing _ptrack map.
+	 * Such files should be fully copied.
+	 */
+
+	if (file->pagemap.bitmapsize == PageBitmapIsEmpty ||
+		file->pagemap_isabsent || !file->exists_in_prev ||
+		!file->pagemap.bitmap)
+		use_pagemap = false;
+	else
+		use_pagemap = true;
+
+	/* Remote mode */
+	if (fio_is_remote(FIO_DB_HOST))
+	{
+		rc = fio_copy_pages(to_fullpath, from_fullpath, file,
+							/* send prev backup START_LSN */
+							backup_mode == BACKUP_MODE_DIFF_DELTA &&
+							file->exists_in_prev ? prev_backup_start_lsn : InvalidXLogRecPtr,
+							calg, clevel, checksum_version,
+							/* send pagemap if any */
+							use_pagemap,
+							/* variables for error reporting */
+							&err_blknum, &errmsg, &headers);
+	}
+	else
+	{
+		/* TODO: stop handling errors internally */
+		rc = copy_pages(conn_arg, to_fullpath, from_fullpath, file,
+						/* send prev backup START_LSN */
+						backup_mode == BACKUP_MODE_DIFF_DELTA &&
+						file->exists_in_prev ? prev_backup_start_lsn : InvalidXLogRecPtr,
+						checksum_version, use_pagemap,
+						backup_mode, ptrack_version_num, ptrack_schema);
+	}
+
+	/* check for errors */
+	if (rc == FILE_MISSING)
+	{
+		elog(is_merge ? ERROR : LOG, "File not found: \"%s\"", from_fullpath);
+		file->write_size = FILE_NOT_FOUND;
+		goto cleanup;
+	}
+
+	else if (rc == WRITE_FAILED)
+		elog(ERROR, "Cannot write block %u of \"%s\": %s",
+				err_blknum, to_fullpath, strerror(errno));
+
+	else if (rc == PAGE_CORRUPTION)
+	{
+		if (errmsg)
+			elog(ERROR, "Corruption detected in file \"%s\", block %u: %s",
+					from_fullpath, err_blknum, errmsg);
+		else
+			elog(ERROR, "Corruption detected in file \"%s\", block %u",
+					from_fullpath, err_blknum);
+	}
+	/* OPEN_FAILED and READ_FAILED */
+	else if (rc == OPEN_FAILED)
+	{
+		if (errmsg)
+			elog(ERROR, "%s", errmsg);
+		else
+			elog(ERROR, "Cannot open file \"%s\"", from_fullpath);
+	}
+	else if (rc == READ_FAILED)
+	{
+		if (errmsg)
+			elog(ERROR, "%s", errmsg);
+		else
+			elog(ERROR, "Cannot read file \"%s\"", from_fullpath);
+	}
+
+	file->read_size = rc * BLCKSZ;
+
+	/* refresh n_blocks for FULL and DELTA */
+	if (backup_mode == BACKUP_MODE_FULL ||
+		backup_mode == BACKUP_MODE_DIFF_DELTA)
+		file->n_blocks = file->read_size / BLCKSZ;
+
+	/* Treat the file as unchanged in incremental mode if nothing was written */
+	if (backup_mode != BACKUP_MODE_FULL &&
+		file->exists_in_prev &&
+		file->write_size == 0 &&
+		file->n_blocks > 0)
+	{
+		file->write_size = BYTES_INVALID;
+	}
+
+cleanup:
+
+	/* finish CRC calculation */
+	FIN_FILE_CRC32(true, file->crc);
+
+	pg_free(errmsg);
+	pg_free(file->pagemap.bitmap); /* NOTE(review): the pointer is not reset afterwards */
+	pg_free(headers);
+}
+
 /*
  * Backup non data file
  * We do not apply compression to this file.
@@ -2121,6 +2300,128 @@ send_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_f
 	return n_blocks_read;
 }
 
+/*
+ * Copy the pages of a local data file into to_fullpath one-to-one: no page
+ * headers are added and nothing is compressed (derived from send_pages).
+ *
+ * Every block is fetched with prepare_page(); a block reported as PageIsOk
+ * is written by copy_page() at offset blknum * BLCKSZ of the destination,
+ * the same placement fio_copy_pages() uses. With use_pagemap only the blocks
+ * listed in file->pagemap are visited.
+ *
+ * Returns the number of blocks visited, or FILE_MISSING when the source
+ * file does not exist.
+ *
+ * NOTE(review): writing blocks in place assumes that open_local_file_rw()
+ * keeps the current content of the destination file - confirm.
+ */
+int
+copy_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_fullpath,
+		   pgFile *file, XLogRecPtr prev_backup_start_lsn,
+		   uint32 checksum_version, bool use_pagemap,
+		   BackupMode backup_mode, int ptrack_version_num, const char *ptrack_schema)
+{
+	FILE *in = NULL;
+	FILE *out = NULL;
+	char curr_page[BLCKSZ];
+	int	n_blocks_read = 0;
+	BlockNumber blknum = 0;
+	BlockNumber next_out_blknum = 0; /* block the output stream is positioned at */
+	datapagemap_iterator_t *iter = NULL;
+
+	/* stdio buffers */
+	char *in_buf = NULL;
+	char *out_buf = NULL;
+
+	/* open source file for read */
+	in = fopen(from_fullpath, PG_BINARY_R);
+	if (in == NULL)
+	{
+		/*
+		 * If file is not found, this is not an error.
+		 * It could have been deleted by concurrent postgres transaction.
+		 */
+		if (errno == ENOENT)
+			return FILE_MISSING;
+
+		elog(ERROR, "Cannot open file \"%s\": %s", from_fullpath, strerror(errno));
+	}
+
+	/*
+	 * Enable stdio buffering for local input file,
+	 * unless the pagemap is involved, which
+	 * implies a lot of random access.
+	 */
+
+	if (use_pagemap)
+	{
+		iter = datapagemap_iterate(&file->pagemap);
+		datapagemap_next(iter, &blknum); /* set first block */
+
+		setvbuf(in, NULL, _IONBF, BUFSIZ);
+	}
+	else
+	{
+		in_buf = pgut_malloc(STDIO_BUFSIZE);
+		setvbuf(in, in_buf, _IOFBF, STDIO_BUFSIZE);
+	}
+
+	/* TODO: errors should be handled here! */
+	out = open_local_file_rw(to_fullpath, &out_buf, STDIO_BUFSIZE);
+
+	while (blknum < file->n_blocks)
+	{
+		PageState page_st;
+		int rc = prepare_page(conn_arg, file, prev_backup_start_lsn,
+							  blknum, in, backup_mode, curr_page,
+							  true, checksum_version,
+							  ptrack_version_num, ptrack_schema,
+							  from_fullpath, &page_st);
+		if (rc == PageIsTruncated)
+			break;
+
+		else if (rc == PageIsOk)
+		{
+			/* blocks may be non-contiguous: put each one at its own offset */
+			if (blknum != next_out_blknum &&
+				fseek(out, (long) blknum * BLCKSZ, SEEK_SET) != 0)
+				elog(ERROR, "Cannot seek block %u of \"%s\": %s",
+					 blknum, to_fullpath, strerror(errno));
+
+			copy_page(file, blknum, in, out, curr_page, to_fullpath);
+			next_out_blknum = blknum + 1;
+		}
+
+		n_blocks_read++;
+
+		/* next block */
+		if (use_pagemap)
+		{
+			/* exit if pagemap is exhausted */
+			if (!datapagemap_next(iter, &blknum))
+				break;
+		}
+		else
+			blknum++;
+	}
+
+	/* cleanup */
+	if (in && fclose(in))
+		elog(ERROR, "Cannot close the source file \"%s\": %s",
+			 from_fullpath, strerror(errno));
+
+	/* close local output file */
+	if (out && fclose(out))
+		elog(ERROR, "Cannot close the backup file \"%s\": %s",
+			 to_fullpath, strerror(errno));
+
+	pg_free(iter);
+	pg_free(in_buf);
+	pg_free(out_buf);
+
+	return n_blocks_read;
+}
+
 /*
  * Attempt to open header file, read content and return as
  * array of headers.
diff --git a/src/help.c b/src/help.c index cab143ad8..c6d7066ef 100644 --- a/src/help.c +++ b/src/help.c @@ -29,6 +29,7 @@ static void help_archive_get(void); static void help_checkdb(void); static void help_help(void); static void help_version(void); +static void help_catchup(void); void help_print_version(void) @@ -70,6 +71,7 @@ help_command(ProbackupSubcmd const subcmd) &help_internal, // AGENT_CMD &help_help, &help_version, + &help_catchup, }; Assert((int)subcmd < sizeof(help_functions) / sizeof(help_functions[0])); @@ -1002,3 +1004,45 @@ help_version(void) printf(_("\n%s version\n"), PROGRAM_NAME); printf(_("%s --version\n\n"), PROGRAM_NAME); } + +static void +help_catchup(void) +{ + printf(_("\n%s catchup -b backup-mode\n"), PROGRAM_NAME); + printf(_(" --catchup-source-pgdata=path_to_pgdata_on_remote_server\n")); + printf(_(" --catchup-destination-pgdata=path_to_local_dir\n")); + printf(_(" [--stream [-S slot-name]] [--temp-slot]\n")); + printf(_(" [-j num-threads]\n")); + printf(_(" [-d dbname] [-h host] [-p port] [-U username]\n")); + printf(_(" [-w --no-password] [-W --password]\n")); + printf(_(" [--remote-proto] [--remote-host]\n")); + printf(_(" [--remote-port] [--remote-path] [--remote-user]\n")); + printf(_(" [--ssh-options]\n\n")); + + printf(_(" -b, --backup-mode=backup-mode backup mode=FULL|PTRACK\n")); + printf(_(" --stream stream the transaction log and include it in the backup\n")); + printf(_(" -S, --slot=SLOTNAME replication slot to use\n")); + printf(_(" --temp-slot use temporary replication slot\n")); + + printf(_(" -j, --threads=NUM number of parallel threads\n")); + + printf(_("\n Connection options:\n")); + printf(_(" -U, --pguser=USERNAME user name to connect as (default: current local user)\n")); + printf(_(" -d, --pgdatabase=DBNAME database to connect (default: username)\n")); + printf(_(" -h, --pghost=HOSTNAME database server host or socket directory(default: 'local socket')\n")); + printf(_(" -p, --pgport=PORT database server port 
(default: 5432)\n")); + printf(_(" -w, --no-password never prompt for password\n")); + printf(_(" -W, --password force password prompt\n\n")); + + printf(_("\n Remote options:\n")); + printf(_(" --remote-proto=protocol remote protocol to use\n")); + printf(_(" available options: 'ssh', 'none' (default: ssh)\n")); + printf(_(" --remote-host=hostname remote host address or hostname\n")); + printf(_(" --remote-port=port remote host port (default: 22)\n")); + printf(_(" --remote-path=path path to directory with pg_probackup binary on remote host\n")); + printf(_(" (default: current binary path)\n")); + printf(_(" --remote-user=username user name for ssh connection (default: current user)\n")); + printf(_(" --ssh-options=ssh_options additional ssh options (default: none)\n")); + printf(_(" (example: --ssh-options='-c cipher_spec -F configfile')\n\n")); +} + diff --git a/src/pg_probackup.c b/src/pg_probackup.c index dd2ac97ee..5c230b619 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -63,6 +63,9 @@ bool backup_logs = false; bool smooth_checkpoint; char *remote_agent; static char *backup_note = NULL; +/* catchup options */ +static char *catchup_source_pgdata = NULL; +static char *catchup_destination_pgdata = NULL; /* restore options */ static char *target_time = NULL; static char *target_xid = NULL; @@ -170,6 +173,9 @@ static ConfigOption cmd_options[] = { 'b', 184, "merge-expired", &merge_expired, SOURCE_CMD_STRICT }, { 'b', 185, "dry-run", &dry_run, SOURCE_CMD_STRICT }, { 's', 238, "note", &backup_note, SOURCE_CMD_STRICT }, + /* catchup options */ + { 's', 239, "catchup-source-pgdata", &catchup_source_pgdata, SOURCE_CMD_STRICT }, + { 's', 240, "catchup-destination-pgdata", &catchup_destination_pgdata, SOURCE_CMD_STRICT }, /* restore options */ { 's', 136, "recovery-target-time", &target_time, SOURCE_CMD_STRICT }, { 's', 137, "recovery-target-xid", &target_xid, SOURCE_CMD_STRICT }, @@ -405,17 +411,17 @@ main(int argc, char *argv[]) elog(ERROR, "-B, 
--backup-path must be an absolute path"); } /* backup_path is required for all pg_probackup commands except help, version and checkdb */ - if (backup_path == NULL && backup_subcmd != CHECKDB_CMD && backup_subcmd != HELP_CMD && backup_subcmd != VERSION_CMD) + if (backup_path == NULL && backup_subcmd != CHECKDB_CMD && backup_subcmd != HELP_CMD && backup_subcmd != VERSION_CMD && backup_subcmd != CATCHUP_CMD) elog(ERROR, "required parameter not specified: BACKUP_PATH (-B, --backup-path)"); /* * Option --instance is required for all commands except - * init, show, checkdb and validate + * init, show, checkdb, validate and catchup */ if (instance_name == NULL) { if (backup_subcmd != INIT_CMD && backup_subcmd != SHOW_CMD && - backup_subcmd != VALIDATE_CMD && backup_subcmd != CHECKDB_CMD) + backup_subcmd != VALIDATE_CMD && backup_subcmd != CHECKDB_CMD && backup_subcmd != CATCHUP_CMD) elog(ERROR, "required parameter not specified: --instance"); } else @@ -510,6 +516,10 @@ main(int argc, char *argv[]) setMyLocation(backup_subcmd); } } + else if (backup_subcmd == CATCHUP_CMD) + { + config_get_opt_env(instance_options); + } /* * Disable logging into file for archive-push and archive-get. 
@@ -710,6 +720,20 @@ main(int argc, char *argv[]) } } + /* checking required options */ + if (backup_subcmd == CATCHUP_CMD) + { + if (catchup_source_pgdata == NULL) + elog(ERROR, "You must specify \"--catchup-source-pgdata\" option with the \"%s\" command", get_subcmd_name(backup_subcmd)); + if (catchup_destination_pgdata == NULL) + elog(ERROR, "You must specify \"--catchup-destination-pgdata\" option with the \"%s\" command", get_subcmd_name(backup_subcmd)); + if (current.backup_mode == BACKUP_MODE_INVALID) + elog(ERROR, "Required parameter not specified: BACKUP_MODE (-b, --backup-mode)"); + if (current.backup_mode != BACKUP_MODE_FULL && current.backup_mode != BACKUP_MODE_DIFF_PTRACK) + elog(ERROR, "Only \"FULL\" and \"PTRACK\" modes are supported with the \"%s\" command", get_subcmd_name(backup_subcmd)); + // TODO проверить instance_config.conn_opt + } + /* sanity */ if (backup_subcmd == VALIDATE_CMD && restore_params->no_validate) elog(ERROR, "You cannot specify \"--no-validate\" option with the \"%s\" command", @@ -751,6 +775,8 @@ main(int argc, char *argv[]) return do_backup(set_backup_params, no_validate, no_sync, backup_logs); } + case CATCHUP_CMD: + return do_catchup(catchup_source_pgdata, catchup_destination_pgdata, current.backup_mode, instance_config.conn_opt, stream_wal, num_threads); case RESTORE_CMD: return do_restore_or_validate(current.backup_id, recovery_target_options, diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 217c8a7f1..40a841347 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -587,6 +587,32 @@ typedef struct int ret; } backup_files_arg; +typedef struct +{ + PGNodeInfo *nodeInfo; + + const char *from_root; + const char *to_root; + const char *external_prefix; + + parray *files_list; + parray *prev_filelist; + /* TODO разобраться */ + //parray *external_dirs; + XLogRecPtr prev_start_lsn; + BackupMode backup_mode; + + ConnectionArgs conn_arg; + int thread_num; + HeaderMap *hdr_map; + + /* + * Return value from the 
thread. + * 0 means there is no error, 1 - there is an error. + */ + int ret; +} catchup_files_arg; + typedef struct timelineInfo timelineInfo; @@ -826,6 +852,8 @@ extern char *pg_ptrack_get_block(ConnectionArgs *arguments, Oid dbOid, Oid tblsOid, Oid relOid, BlockNumber blknum, size_t *result_size, int ptrack_version_num, const char *ptrack_schema); +/* in catchup.c */ +extern int do_catchup(char *source_pgdata, char *dest_pgdata, BackupMode backup_mode, ConnectionOptions conn_opt, bool stream_wal, int num_threads); /* in restore.c */ extern int do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt, @@ -1051,6 +1079,12 @@ extern void backup_data_file(ConnectionArgs* conn_arg, pgFile *file, CompressAlg calg, int clevel, uint32 checksum_version, int ptrack_version_num, const char *ptrack_schema, HeaderMap *hdr_map, bool missing_ok); +extern void catchup_data_file(ConnectionArgs* conn_arg, pgFile *file, + const char *from_fullpath, const char *to_fullpath, + XLogRecPtr prev_backup_start_lsn, BackupMode backup_mode, + CompressAlg calg, int clevel, uint32 checksum_version, + int ptrack_version_num, const char *ptrack_schema, + HeaderMap *hdr_map, bool missing_ok); extern void backup_non_data_file(pgFile *file, pgFile *prev_file, const char *from_fullpath, const char *to_fullpath, BackupMode backup_mode, time_t parent_backup_time, @@ -1124,6 +1158,7 @@ extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); extern uint32 get_xlog_seg_size(char *pgdata_path); extern void get_redo(const char *pgdata_path, RedoParams *redo); +extern XLogRecPtr get_min_recovery_point(char *pgdata_path); extern void set_min_recovery_point(pgFile *file, const char *backup_path, XLogRecPtr stop_backup_lsn); extern void copy_pgcontrol_file(const char *from_fullpath, fio_location from_location, @@ -1177,6 +1212,10 @@ extern int send_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const c pgFile *file, 
XLogRecPtr prev_backup_start_lsn, CompressAlg calg, int clevel, uint32 checksum_version, bool use_pagemap, BackupPageHeader2 **headers, BackupMode backup_mode, int ptrack_version_num, const char *ptrack_schema); +extern int copy_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_fullpath, + pgFile *file, XLogRecPtr prev_backup_start_lsn, + uint32 checksum_version, bool use_pagemap, + BackupMode backup_mode, int ptrack_version_num, const char *ptrack_schema); /* FIO */ extern void setMyLocation(ProbackupSubcmd const subcmd); @@ -1185,6 +1224,10 @@ extern int fio_send_pages(const char *to_fullpath, const char *from_fullpath, pg XLogRecPtr horizonLsn, int calg, int clevel, uint32 checksum_version, bool use_pagemap, BlockNumber *err_blknum, char **errormsg, BackupPageHeader2 **headers); +extern int fio_copy_pages(const char *to_fullpath, const char *from_fullpath, pgFile *file, + XLogRecPtr horizonLsn, int calg, int clevel, uint32 checksum_version, + bool use_pagemap, BlockNumber *err_blknum, char **errormsg, + BackupPageHeader2 **headers); /* return codes for fio_send_pages */ extern int fio_send_file_gz(const char *from_fullpath, const char *to_fullpath, FILE* out, char **errormsg); extern int fio_send_file(const char *from_fullpath, const char *to_fullpath, FILE* out, @@ -1238,4 +1281,19 @@ extern void start_WAL_streaming(PGconn *backup_conn, char *stream_dst_path, ConnectionOptions *conn_opt, XLogRecPtr startpos, TimeLineID starttli); extern int wait_WAL_streaming_end(parray *backup_files_list); + +/* functions used in both backup.c and catchup.c, implemented in backup.c */ +extern void pg_start_backup(const char *label, bool smooth, BackupMode backup_mode, bool from_replica, XLogRecPtr *start_lsn, + PGNodeInfo *nodeInfo, PGconn *conn); + +extern void pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *nodeInfo, const char *destination_dir); +extern XLogRecPtr wait_wal_lsn(XLogRecPtr lsn, bool is_start_lsn, TimeLineID 
tli,
+										   bool in_prev_segment, bool segment_only,
+										   int timeout_elevel, bool in_stream_dir, const char *wal_segment_dir);
+
+extern void check_external_for_tablespaces(parray *external_list,
+										   PGconn *backup_conn);
+
+extern parray *backup_files_list;
+
 #endif   /* PG_PROBACKUP_H */
diff --git a/src/stream.c b/src/stream.c
index 825aa0e7d..c5c659499 100644
--- a/src/stream.c
+++ b/src/stream.c
@@ -381,20 +381,25 @@ start_WAL_streaming(PGconn *backup_conn, char *stream_dst_path, ConnectionOption
 	/* Set error exit code as default */
 	stream_thread_arg.ret = 1;
 	/* we must use startpos as start_lsn from start_backup */
-	stream_thread_arg.startpos = current.start_lsn;
-	stream_thread_arg.starttli = current.tli;
+	stream_thread_arg.startpos = startpos;
+	stream_thread_arg.starttli = starttli;
 
 	thread_interrupted = false;
 	pthread_create(&stream_thread, NULL, StreamLog, &stream_thread_arg);
 }
 
-/* Wait for the completion of stream */
+/*
+ * Wait for the completion of stream
+ * append list of streamed xlog files
+ * into backup_files_list (if it is not NULL)
+ */
 int
 wait_WAL_streaming_end(parray *backup_files_list)
 {
 	pthread_join(stream_thread, NULL);
 
-	parray_concat(backup_files_list, xlog_files_list);
+	if(backup_files_list != NULL)
+		parray_concat(backup_files_list, xlog_files_list);
 	parray_free(xlog_files_list);
 	return stream_thread_arg.ret;
 }
diff --git a/src/util.c b/src/util.c
index b8756e027..061a1d8f3 100644
--- a/src/util.c
+++ b/src/util.c
@@ -384,6 +384,24 @@ get_redo(const char *pgdata_path, RedoParams *redo)
 	redo->checksum_version = ControlFile.data_checksum_version;
 }
 
+/*
+ * Load XLOG_CONTROL_FILE found under pgdata_path (read with FIO_LOCAL_HOST)
+ * and return the minRecoveryPoint recorded in it.
+ */
+XLogRecPtr
+get_min_recovery_point(char *pgdata_path)
+{
+	ControlFileData control;
+	size_t		raw_size;
+	char	   *raw = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &raw_size,
+								false, FIO_LOCAL_HOST);
+
+	digestControlFile(&control, raw, raw_size);
+	pg_free(raw);
+
+	return control.minRecoveryPoint;
+}
+
 /*
  * Rewrite
minRecoveryPoint of pg_control in backup directory. minRecoveryPoint
  * 'as-is' is not to be trusted.
diff --git a/src/utils/configuration.c b/src/utils/configuration.c
index afc1bc056..04bfbbe3b 100644
--- a/src/utils/configuration.c
+++ b/src/utils/configuration.c
@@ -110,6 +110,7 @@ static char const * const subcmd_names[] =
 	"agent",
 	"help",
 	"version",
+	"catchup",
 };
 
 ProbackupSubcmd
diff --git a/src/utils/configuration.h b/src/utils/configuration.h
index 4ed4e0e61..3a5de4b83 100644
--- a/src/utils/configuration.h
+++ b/src/utils/configuration.h
@@ -38,7 +38,8 @@ typedef enum ProbackupSubcmd
 	SSH_CMD,
 	AGENT_CMD,
 	HELP_CMD,
-	VERSION_CMD
+	VERSION_CMD,
+	CATCHUP_CMD,
 } ProbackupSubcmd;
 
 typedef enum OptionSource
diff --git a/src/utils/file.c b/src/utils/file.c
index 0266e23b9..fa83177b0 100644
--- a/src/utils/file.c
+++ b/src/utils/file.c
@@ -96,7 +96,7 @@ setMyLocation(ProbackupSubcmd const subcmd)
 	MyLocation = IsSshProtocol()
 		? (subcmd == ARCHIVE_PUSH_CMD || subcmd == ARCHIVE_GET_CMD)
 		  ? FIO_DB_HOST
-		  : (subcmd == BACKUP_CMD || subcmd == RESTORE_CMD || subcmd == ADD_INSTANCE_CMD)
+		  : (subcmd == BACKUP_CMD || subcmd == RESTORE_CMD || subcmd == ADD_INSTANCE_CMD || subcmd == CATCHUP_CMD)
 			? FIO_BACKUP_HOST
 			: FIO_LOCAL_HOST
 		: FIO_LOCAL_HOST;
@@ -1812,6 +1812,175 @@ int fio_send_pages(const char *to_fullpath, const char *from_fullpath, pgFile *f
 	return n_blocks_read;
 }
 
+/*
+ * Request the pages of from_fullpath from the remote agent (FIO_SEND_PAGES)
+ * and write each received page into to_fullpath at offset blknum * BLCKSZ,
+ * without the BackupPageHeader that precedes it in the message.
+ * Derived from fio_send_pages.
+ *
+ * Returns the number of blocks reported by the agent in its EOF message
+ * (attempts and half-read blocks are not counted) or one of the error codes:
+ * FILE_MISSING, OPEN_FAILED, READ_FAILED - passed through from the agent
+ * PAGE_CORRUPTION - *err_blknum is set
+ * WRITE_FAILED - *err_blknum is set
+ * *errormsg is allocated when the agent supplies an error text.
+ *
+ * In case of DELTA mode horizonLsn must be a valid lsn,
+ * otherwise it should be set to InvalidXLogRecPtr.
+ * The destination handle is closed with fio_fclose on every return path.
+ */
+int
+fio_copy_pages(const char *to_fullpath, const char *from_fullpath, pgFile *file,
+			   XLogRecPtr horizonLsn, int calg, int clevel, uint32 checksum_version,
+			   bool use_pagemap, BlockNumber* err_blknum, char **errormsg,
+			   BackupPageHeader2 **headers)
+{
+	FILE *out = NULL;
+	struct {
+		fio_header hdr;
+		fio_send_request arg;
+	} req;
+	BlockNumber	n_blocks_read = 0;
+	BlockNumber blknum = 0;
+
+	/* send message with header
+
+	  16bytes 24bytes var var
+	--------------------------------------------------------------
+	| fio_header | fio_send_request | FILE PATH | BITMAP(if any) |
+	--------------------------------------------------------------
+	*/
+
+	req.hdr.cop = FIO_SEND_PAGES;
+
+	if (use_pagemap)
+	{
+		req.hdr.size = sizeof(fio_send_request) + (*file).pagemap.bitmapsize + strlen(from_fullpath) + 1;
+		req.arg.bitmapsize = (*file).pagemap.bitmapsize;
+
+		/* TODO: add optimization for the case of pagemap
+		 * containing small number of blocks with big serial numbers:
+		 * https://github.com/postgrespro/pg_probackup/blob/remote_page_backup/src/utils/file.c#L1211
+		 */
+	}
+	else
+	{
+		req.hdr.size = sizeof(fio_send_request) + strlen(from_fullpath) + 1;
+		req.arg.bitmapsize = 0;
+	}
+
+	req.arg.nblocks = file->size/BLCKSZ;
+	req.arg.segmentno = file->segno * RELSEG_SIZE;
+	req.arg.horizonLsn = horizonLsn;
+	req.arg.checksumVersion = checksum_version;
+	req.arg.calg = calg;
+	req.arg.clevel = clevel;
+	req.arg.path_len = strlen(from_fullpath) + 1;
+
+	file->compress_alg = calg; /* TODO: wtf? why here? */
+
+	/* send header */
+	IO_CHECK(fio_write_all(fio_stdout, &req, sizeof(req)), sizeof(req));
+
+	/* send file path */
+	IO_CHECK(fio_write_all(fio_stdout, from_fullpath, req.arg.path_len), req.arg.path_len);
+
+	/* send pagemap if any */
+	if (use_pagemap)
+		IO_CHECK(fio_write_all(fio_stdout, (*file).pagemap.bitmap, (*file).pagemap.bitmapsize), (*file).pagemap.bitmapsize);
+
+	out = fio_fopen(to_fullpath, PG_BINARY_R "+", FIO_BACKUP_HOST);
+	if (out == NULL)
+		elog(ERROR, "Cannot open restore target file \"%s\": %s", to_fullpath, strerror(errno));
+
+	while (true)
+	{
+		fio_header hdr;
+		char buf[BLCKSZ + sizeof(BackupPageHeader)];
+		IO_CHECK(fio_read_all(fio_stdin, &hdr, sizeof(hdr)), sizeof(hdr));
+
+		if (interrupted)
+			elog(ERROR, "Interrupted during page reading");
+
+		if (hdr.cop == FIO_ERROR)
+		{
+			/* FILE_MISSING, OPEN_FAILED and READ_FAILED */
+			if (hdr.size > 0)
+			{
+				IO_CHECK(fio_read_all(fio_stdin, buf, hdr.size), hdr.size);
+				*errormsg = pgut_malloc(hdr.size);
+				snprintf(*errormsg, hdr.size, "%s", buf);
+			}
+
+			/* do not leak the destination handle on the error path */
+			fio_fclose(out);
+			return hdr.arg;
+		}
+		else if (hdr.cop == FIO_SEND_FILE_CORRUPTION)
+		{
+			*err_blknum = hdr.arg;
+
+			if (hdr.size > 0)
+			{
+				IO_CHECK(fio_read_all(fio_stdin, buf, hdr.size), hdr.size);
+				*errormsg = pgut_malloc(hdr.size);
+				snprintf(*errormsg, hdr.size, "%s", buf);
+			}
+			fio_fclose(out);
+			return PAGE_CORRUPTION;
+		}
+		else if (hdr.cop == FIO_SEND_FILE_EOF)
+		{
+			/* n_blocks_read reported by EOF */
+			n_blocks_read = hdr.arg;
+
+			/* receive headers if any */
+			if (hdr.size > 0)
+			{
+				*headers = pgut_malloc(hdr.size);
+				IO_CHECK(fio_read_all(fio_stdin, *headers, hdr.size), hdr.size);
+				file->n_headers = (hdr.size / sizeof(BackupPageHeader2)) -1;
+			}
+
+			break;
+		}
+		else if (hdr.cop == FIO_PAGE)
+		{
+			blknum = hdr.arg;
+
+			Assert(hdr.size <= sizeof(buf));
+			IO_CHECK(fio_read_all(fio_stdin, buf, hdr.size), hdr.size);
+
+			COMP_FILE_CRC32(true, file->crc, buf, hdr.size);
+
+			elog(INFO, "Copy block %u with size %u of %s", blknum,
+				 (unsigned int) (hdr.size - sizeof(BackupPageHeader)), to_fullpath);
+			if (fio_fseek(out, blknum * BLCKSZ) < 0)
+			{
+				elog(ERROR, "Cannot seek block %u of \"%s\": %s",
+					blknum, to_fullpath, strerror(errno));
+			}
+			// an uncompressed block with a header is expected to arrive here
+			// TODO: insert an assert for that?
+			if (fio_fwrite(out, buf + sizeof(BackupPageHeader), hdr.size - sizeof(BackupPageHeader)) != BLCKSZ)
+			{
+				fio_fclose(out);
+				*err_blknum = blknum;
+				return WRITE_FAILED;
+			}
+			file->write_size += BLCKSZ;
+			file->uncompressed_size += BLCKSZ;
+		}
+		else
+			elog(ERROR, "Remote agent returned message of unexpected type: %i", hdr.cop);
+	}
+
+	/* out was opened with fio_fopen, so it is closed with fio_fclose, not fclose */
+	fio_fclose(out);
+
+	return n_blocks_read;
+}
+
 /* TODO: read file using large buffer
  * Return codes:
  * FIO_ERROR:
diff --git a/tests/catchup.py b/tests/catchup.py
new file mode 100644
index 000000000..c98ab1dff
--- /dev/null
+++ b/tests/catchup.py
@@ -0,0 +1,157 @@
+import os
+import unittest
+from .helpers.ptrack_helpers import ProbackupTest, ProbackupException
+
+module_name = 'catchup'
+
+class CatchupTest(ProbackupTest, unittest.TestCase):
+
+    # @unittest.skip("skip")
+    def dummy(self):
+        """
+        dummy test
+        """
+        fname = self.id().split('.')[3]
+        node = self.make_simple_node(
+            base_dir = os.path.join(module_name, fname, 'node')
+            )
+        node.slow_start()
+
+        # Clean after yourself
+        node.stop()
+        self.del_test_dir(module_name, fname)
+
+    # @unittest.skip("skip")
+    def test_multithread_local_transfer(self):
+        """
+        Test 'multithreaded basebackup' mode
+        create node, insert some test data, catchup into other dir, start, select test data
+        """
+        fname = self.id().split('.')[3]
+
+        source_pg = self.make_simple_node(base_dir = os.path.join(module_name, fname, 'src'))
+
source_pg.slow_start() + source_pg.safe_psql( + "postgres", + "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_base_dir = os.path.join(module_name, fname, 'dst'), + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4'] + ) + source_pg.stop() + + dest_pg.slow_start() + self.assertEqual( + result, + dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), + 'Different answer from copy') + dest_pg.stop() + + # Clean after yourself + self.del_test_dir(module_name, fname) + + # @unittest.skip("skip") + def test_multithread_remote_transfer(self): + """ + Test 'multithreaded basebackup' mode + create node, insert some test data, catchup into other dir, start, select test data + """ + fname = self.id().split('.')[3] + + source_pg = self.make_simple_node(base_dir = os.path.join(module_name, fname, 'src')) + source_pg.slow_start() + source_pg.safe_psql( + "postgres", + "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_base_dir = os.path.join(module_name, fname, 'dst'), + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4', + '--remote-proto=ssh', '--remote-host=localhost'] + ) + source_pg.stop() + + dest_pg.slow_start() + self.assertEqual( + result, + dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), + 'Different answer from copy') + dest_pg.stop() + + # Clean after yourself + self.del_test_dir(module_name, fname) + + # @unittest.skip("skip") + def test_remote_catchup(self): + """ + Test 'catchup' mode + create node, + make a copy with replication, start copy, stop copy, + generate some load on master, insert some test 
data on master, + catchup copy, start and select test data + """ + fname = self.id().split('.')[3] + + # prepare master + source_pg = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'src'), + set_replication = True, + ptrack_enable = True, + initdb_params = ['--data-checksums'] + ) + source_pg.slow_start() + source_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") + source_pg.safe_psql("postgres", "CREATE TABLE ultimate_question(answer int)") + + # make clean shutdowned lagging behind replica + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_base_dir = os.path.join(module_name, fname, 'dst'), + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', + '--remote-proto=ssh', '--remote-host=localhost'] + ) + self.set_replica(source_pg, dest_pg) + dest_pg.slow_start(replica = True) + dest_pg.stop() + + # make changes on master + source_pg.pgbench_init(scale=10) + pgbench = source_pg.pgbench(options=['-T', '10', '--no-vacuum']) + pgbench.wait() + source_pg.safe_psql("postgres", "INSERT INTO ultimate_question VALUES(42)") + result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + # catchup + self.catchup_node( + backup_mode = 'PTRACK', + source_pgdata = source_pg.data_dir, + destination_base_dir = os.path.join(module_name, fname, 'dst'), + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', + '--remote-proto=ssh', '--remote-host=localhost'], + node = dest_pg + ) + + # stop replication + source_pg.stop() + + # check latest changes + self.set_replica(source_pg, dest_pg) + dest_pg.slow_start(replica = True) + self.assertEqual( + result, + dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), + 'Different answer from copy') + dest_pg.stop() + + # Clean after yourself + self.del_test_dir(module_name, fname) + + diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index 3c75ca2e7..38998ce38 100644 --- 
a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -973,6 +973,31 @@ def restore_node( return self.run_pb(cmd_list + options, gdb=gdb, old_binary=old_binary) + def catchup_node( + self, + backup_mode, source_pgdata, destination_base_dir, + options = [], + node = None + ): + + real_destination_dir = os.path.join(self.tmp_path, destination_base_dir) + if not node: + shutil.rmtree(real_destination_dir, ignore_errors = True) + node = testgres.get_new_node('test', base_dir = real_destination_dir) + node.slow_start = slow_start.__get__(node) + node.should_rm_dirs = True + + cmd_list = [ + 'catchup', + '--backup-mode={0}'.format(backup_mode), + '--catchup-source-pgdata={0}'.format(source_pgdata), + '--catchup-destination-pgdata={0}'.format(node.data_dir) + ] + self.run_pb(cmd_list + options) + + node.append_conf(port=node.port) + return node + def show_pb( self, backup_dir, instance=None, backup_id=None, options=[], as_text=False, as_json=True, old_binary=False, From 69db990b64791087b18cc30fbf2dec93f61f8411 Mon Sep 17 00:00:00 2001 From: Grigory Smolkin Date: Sat, 15 May 2021 16:53:54 +0300 Subject: [PATCH 03/63] use correct args for wait_wal_lsn() --- src/backup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/backup.c b/src/backup.c index 5eb984eec..c45cc2568 100644 --- a/src/backup.c +++ b/src/backup.c @@ -87,7 +87,6 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool { int i; char external_prefix[MAXPGPATH]; /* Temp value. 
Used as template */ - char dst_backup_path[MAXPGPATH]; char label[1024]; XLogRecPtr prev_backup_start_lsn = InvalidXLogRecPtr; @@ -270,17 +269,19 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool /* start stream replication */ if (stream_wal) { - join_path_components(dst_backup_path, current.database_dir, PG_XLOG_DIR); - fio_mkdir(dst_backup_path, DIR_PERMISSION, FIO_BACKUP_HOST); + char stream_xlog_path[MAXPGPATH]; - start_WAL_streaming(backup_conn, dst_backup_path, &instance_config.conn_opt, + join_path_components(stream_xlog_path, current.database_dir, PG_XLOG_DIR); + fio_mkdir(stream_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); + + start_WAL_streaming(backup_conn, stream_xlog_path, &instance_config.conn_opt, current.start_lsn, current.tli); /* Make sure that WAL streaming is working * PAGE backup in stream mode is waited twice, first for * segment in WAL archive and then for streamed segment */ - wait_wal_lsn(current.start_lsn, true, current.tli, false, true, ERROR, true, &current); + wait_wal_lsn(current.start_lsn, true, current.tli, false, true, ERROR, true, stream_xlog_path); } /* initialize backup's file list */ From 6a3c3d3088c2a968bb8820502f777dc44f92246f Mon Sep 17 00:00:00 2001 From: Grigory Smolkin Date: Sun, 16 May 2021 22:02:24 +0300 Subject: [PATCH 04/63] [Issue #277] review, some improvements, refactoring and overhaul, several TODO added --- src/backup.c | 10 +- src/catchup.c | 441 ++++++++++++--------------------------------- src/help.c | 2 +- src/pg_probackup.h | 17 +- src/restore.c | 2 - src/util.c | 4 +- 6 files changed, 133 insertions(+), 343 deletions(-) diff --git a/src/backup.c b/src/backup.c index c45cc2568..7bbc8e4cc 100644 --- a/src/backup.c +++ b/src/backup.c @@ -528,11 +528,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool } /* Notify end of backup */ - { - char backup_database_dir[MAXPGPATH]; - pgBackupGetPath(&current, backup_database_dir, lengthof(backup_database_dir),
DATABASE_DIR); - pg_stop_backup(&current, backup_conn, nodeInfo, backup_database_dir); - } + pg_stop_backup(&current, backup_conn, nodeInfo, current.database_dir); /* In case of backup from replica >= 9.6 we must fix minRecPoint, * First we must find pg_control in backup_files_list. @@ -719,7 +715,7 @@ pgdata_basic_setup(ConnectionOptions conn_opt, PGNodeInfo *nodeInfo) if (nodeInfo->is_superuser) elog(WARNING, "Current PostgreSQL role is superuser. " - "It is not recommended to run backup or checkdb as superuser."); + "It is not recommended to run pg_probackup under superuser."); StrNCpy(current.server_version, nodeInfo->server_version_str, sizeof(current.server_version)); @@ -974,7 +970,7 @@ check_server_version(PGconn *conn, PGNodeInfo *nodeInfo) * All system identifiers must be equal. */ void -check_system_identifiers(PGconn *conn, char *pgdata) +check_system_identifiers(PGconn *conn, const char *pgdata) { uint64 system_id_conn; uint64 system_id_pgdata; diff --git a/src/catchup.c b/src/catchup.c index 5a8beb370..a14d99456 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -29,31 +29,32 @@ */ static void *catchup_files(void *arg); -static void do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs); - +/* + * TODO: + * - add description + * - fallback to FULL mode if dest PGDATA is empty + */ static void -do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs) +do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, + PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs, + bool dest_pgdata_is_empty) { int i; - //char database_path[MAXPGPATH]; - //char external_prefix[MAXPGPATH]; /* Temp value.
Used as template */ char dst_xlog_path[MAXPGPATH]; char label[1024]; - /* XLogRecPtr prev_backup_start_lsn = InvalidXLogRecPtr; */ XLogRecPtr sync_lsn = InvalidXLogRecPtr; XLogRecPtr start_lsn; /* arrays with meta info for multi threaded backup */ pthread_t *threads; catchup_files_arg *threads_args; - bool backup_isok = true; + bool catchup_isok = true; - /* pgBackup *prev_backup = NULL; */ - parray *prev_backup_filelist = NULL; - parray *backup_list = NULL; + parray *source_filelist = NULL; + parray *dest_filelist = NULL; parray *external_dirs = NULL; - /* used for multitimeline incremental backup */ + /* TODO: in case of timeline mistmatch, check that source PG timeline descending from dest PG timeline */ parray *tli_list = NULL; /* for fancy reporting */ @@ -69,10 +70,6 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, check_external_for_tablespaces(external_dirs, source_conn); } - /* Clear ptrack files for not PTRACK backups */ - if (backup_mode != BACKUP_MODE_DIFF_PTRACK && nodeInfo->is_ptrack_enable) - pg_ptrack_clear(source_conn, nodeInfo->ptrack_version_num); - /* notify start of backup to PostgreSQL server */ time2iso(label, lengthof(label), current.start_time, false); strncat(label, " with pg_probackup", lengthof(label) - @@ -89,30 +86,20 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, current.tli = get_current_timeline_from_control(false); #endif - /* In PAGE mode or in ARCHIVE wal-mode wait for current segment */ - if (backup_mode == BACKUP_MODE_DIFF_PAGE ||!stream_wal) - /* - * Do not wait start_lsn for stream backup. - * Because WAL streaming will start after pg_start_backup() in stream - * mode. 
- */ - wait_wal_lsn(start_lsn, true, current.tli, false, true, ERROR, false, arclog_path); - - if (backup_mode == BACKUP_MODE_DIFF_PAGE || - backup_mode == BACKUP_MODE_DIFF_PTRACK || - backup_mode == BACKUP_MODE_DIFF_DELTA) + if (!dest_pgdata_is_empty && + (backup_mode == BACKUP_MODE_DIFF_PTRACK || + backup_mode == BACKUP_MODE_DIFF_DELTA)) { - prev_backup_filelist = parray_new(); - dir_list_file(prev_backup_filelist, dest_pgdata, - true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); + dest_filelist = parray_new(); + dir_list_file(dest_filelist, dest_pgdata, + true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); sync_lsn = get_min_recovery_point(dest_pgdata); elog(INFO, "syncLSN = %X/%X", (uint32) (sync_lsn >> 32), (uint32) sync_lsn); } /* - * It`s illegal to take PTRACK backup if LSN from ptrack_control() is not - * equal to start_lsn of previous backup. + * TODO: move to separate function to use in both backup.c and catchup.c */ if (backup_mode == BACKUP_MODE_DIFF_PTRACK) { @@ -137,82 +124,44 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, } } - /* For incremental backup check that start_lsn is not from the past - * Though it will not save us if PostgreSQL instance is actually - * restored STREAM backup. - */ + /* Check that sync_lsn is less than start_lsn */ /* TODO это нужно? */ if (backup_mode != BACKUP_MODE_FULL && sync_lsn > start_lsn) - elog(ERROR, "Current START LSN %X/%X is lower than START LSN %X/%X.
" - "It may indicate that we are trying to backup PostgreSQL instance from the past.", + elog(ERROR, "Current START LSN %X/%X is lower than SYNC LSN %X/%X, " + "it may indicate that we are trying to catchup with PostgreSQL instance from the past", (uint32) (start_lsn >> 32), (uint32) (start_lsn), (uint32) (sync_lsn >> 32), (uint32) (sync_lsn)); - /* Update running backup meta with START LSN */ - //write_backup(&current, true); - - //pgBackupGetPath(&current, database_path, lengthof(database_path), - // DATABASE_DIR); - //pgBackupGetPath(&current, external_prefix, lengthof(external_prefix), - // EXTERNAL_DIR); - - /* start stream replication */ + /* Start stream replication */ if (stream_wal) { - instance_config.system_identifier = get_system_identifier(source_pgdata); join_path_components(dst_xlog_path, dest_pgdata, PG_XLOG_DIR); fio_mkdir(dst_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); - start_WAL_streaming(source_conn, dst_xlog_path, &instance_config.conn_opt, start_lsn, current.tli); } /* initialize backup list */ - backup_files_list = parray_new(); + source_filelist = parray_new(); /* list files with the logical path. omit $PGDATA */ if (fio_is_remote(FIO_DB_HOST)) - fio_list_dir(backup_files_list, source_pgdata, + fio_list_dir(source_filelist, source_pgdata, true, true, false, backup_logs, true, 0); else - dir_list_file(backup_files_list, source_pgdata, + dir_list_file(source_filelist, source_pgdata, true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); - /* - * Append to backup list all files and directories - * from external directory option - */ - if (external_dirs) - { - for (i = 0; i < parray_num(external_dirs); i++) - { - /* External dirs numeration starts with 1.
- * 0 value is not external dir */ - if (fio_is_remote(FIO_DB_HOST)) - fio_list_dir(backup_files_list, parray_get(external_dirs, i), - false, true, false, false, true, i+1); - else - dir_list_file(backup_files_list, parray_get(external_dirs, i), - false, true, false, false, true, i+1, FIO_LOCAL_HOST); - } - } - /* close ssh session in main thread */ fio_disconnect(); - /* Sanity check for backup_files_list, thank you, Windows: - * https://github.com/postgrespro/pg_probackup/issues/48 + /* Calculate pgdata_bytes + * TODO: move to separate function to use in both backup.c and catchup.c */ - - if (parray_num(backup_files_list) < 100) - elog(ERROR, "PGDATA is almost empty. Either it was concurrently deleted or " - "pg_probackup do not possess sufficient permissions to list PGDATA content"); - - /* Calculate pgdata_bytes */ - for (i = 0; i < parray_num(backup_files_list); i++) + for (i = 0; i < parray_num(source_filelist); i++) { - pgFile *file = (pgFile *) parray_get(backup_files_list, i); + pgFile *file = (pgFile *) parray_get(source_filelist, i); if (file->external_dir_num != 0) continue; @@ -240,10 +189,10 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, * Sorted array is used at least in parse_filelist_filenames(), * extractPageMap(), make_pagemap_from_ptrack(). */ - parray_qsort(backup_files_list, pgFileCompareRelPathWithExternal); + parray_qsort(source_filelist, pgFileCompareRelPathWithExternal); /* Extract information about files in backup_list parsing their names:*/ - parse_filelist_filenames(backup_files_list, source_pgdata); + parse_filelist_filenames(source_filelist, source_pgdata); elog(LOG, "Current Start LSN: %X/%X, TLI: %X", (uint32) (start_lsn >> 32), (uint32) (start_lsn), @@ -254,59 +203,32 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, (uint32) (sync_lsn >> 32), (uint32) (sync_lsn), prev_backup->tli); */ - /* - * Build page mapping in incremental mode. 
- */ - if (backup_mode == BACKUP_MODE_DIFF_PAGE || - backup_mode == BACKUP_MODE_DIFF_PTRACK) - { - bool pagemap_isok = true; + /* Build page mapping in PTRACK mode */ + if (backup_mode == BACKUP_MODE_DIFF_PAGE) + elog(ERROR, "Catchup in PAGE mode currently is not supported"); + else if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + { time(&start_time); elog(INFO, "Extracting pagemap of changed blocks"); - if (backup_mode == BACKUP_MODE_DIFF_PAGE) - { - /* - * Build the page map. Obtain information about changed pages - * reading WAL segments present in archives up to the point - * where this backup has started. - */ - /* TODO page пока не поддерживается */ - /* pagemap_isok = extractPageMap(arclog_path, instance_config.xlog_seg_size, - sync_lsn, prev_backup->tli, - current.start_lsn, current.tli, tli_list); - */ - } - else if (backup_mode == BACKUP_MODE_DIFF_PTRACK) - { - /* - * Build the page map from ptrack information. - */ - make_pagemap_from_ptrack_2(backup_files_list, source_conn, - nodeInfo->ptrack_schema, - nodeInfo->ptrack_version_num, - sync_lsn); - } - + /* Build the page map from ptrack information */ + make_pagemap_from_ptrack_2(source_filelist, source_conn, + nodeInfo->ptrack_schema, + nodeInfo->ptrack_version_num, + sync_lsn); time(&end_time); - - /* TODO: add ms precision */ - if (pagemap_isok) - elog(INFO, "Pagemap successfully extracted, time elapsed: %.0f sec", - difftime(end_time, start_time)); - else - elog(ERROR, "Pagemap extraction failed, time elasped: %.0f sec", - difftime(end_time, start_time)); + elog(INFO, "Pagemap successfully extracted, time elapsed: %.0f sec", + difftime(end_time, start_time)); } /* - * Make directories before backup and setup threads at the same time + * Make directories before catchup and setup threads at the same time */ - for (i = 0; i < parray_num(backup_files_list); i++) + for (i = 0; i < parray_num(source_filelist); i++) { - pgFile *file = (pgFile *) parray_get(backup_files_list, i); + pgFile *file = (pgFile *) 
parray_get(source_filelist, i); /* if the entry was a directory, create it in the backup */ if (S_ISDIR(file->mode)) @@ -333,18 +255,7 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, } /* Sort by size for load balancing */ - parray_qsort(backup_files_list, pgFileCompareSize); - /* Sort the array for binary search */ - if (prev_backup_filelist) - parray_qsort(prev_backup_filelist, pgFileCompareRelPathWithExternal); - - /* write initial backup_content.control file and update backup.control */ - //write_backup_filelist(&current, backup_files_list, - // instance_config.pgdata, external_dirs, true); - //write_backup(&current, true); - - /* Init backup page header map */ - //init_header_map(&current); + parray_qsort(source_filelist, pgFileCompareSize); /* init thread args with own file lists */ threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); @@ -360,11 +271,10 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, /* TODO разобраться */ //arg->external_prefix = external_prefix; //arg->external_dirs = external_dirs; - arg->files_list = backup_files_list; - /* TODO !!!! change to target file_list */ - arg->prev_filelist = prev_backup_filelist; - /* arg->prev_start_lsn = prev_backup_start_lsn; */ - arg->prev_start_lsn = sync_lsn; + arg->source_filelist = source_filelist; + /* TODO !!!!
change to target file_list */ + arg->dest_filelist = dest_filelist; + arg->sync_lsn = sync_lsn; arg->backup_mode = backup_mode; arg->conn_arg.conn = NULL; arg->conn_arg.cancel_conn = NULL; @@ -392,40 +302,33 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, { pthread_join(threads[i], NULL); if (threads_args[i].ret == 1) - backup_isok = false; + catchup_isok = false; } time(&end_time); pretty_time_interval(difftime(end_time, start_time), pretty_time, lengthof(pretty_time)); - if (backup_isok) + if (catchup_isok) elog(INFO, "Data files are transferred, time elapsed: %s", pretty_time); else elog(ERROR, "Data files transferring failed, time elapsed: %s", pretty_time); - /* clean previous backup file list */ - if (prev_backup_filelist) - { - parray_walk(prev_backup_filelist, pgFileFree); - parray_free(prev_backup_filelist); - } - /* Notify end of backup */ current.start_lsn = start_lsn; pg_stop_backup(&current, source_conn, nodeInfo, dest_pgdata); /* In case of backup from replica >= 9.6 we must fix minRecPoint, - * First we must find pg_control in backup_files_list. + * First we must find pg_control in source_filelist.
*/ if (current.from_replica && !exclusive_backup) { pgFile *pg_control = NULL; - for (i = 0; i < parray_num(backup_files_list); i++) + for (i = 0; i < parray_num(source_filelist); i++) { - pgFile *tmp_file = (pgFile *) parray_get(backup_files_list, i); + pgFile *tmp_file = (pgFile *) parray_get(source_filelist, i); if (tmp_file->external_dir_num == 0 && (strcmp(tmp_file->rel_path, XLOG_CONTROL_FILE) == 0)) @@ -442,36 +345,21 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, set_min_recovery_point(pg_control, dest_pgdata, current.stop_lsn); } - /* close and sync page header map */ - //if (current.hdr_map.fp) - //{ - // cleanup_header_map(&(current.hdr_map)); - // - // if (fio_sync(current.hdr_map.path, FIO_BACKUP_HOST) != 0) - // elog(ERROR, "Cannot sync file \"%s\": %s", current.hdr_map.path, strerror(errno)); - //} - /* close ssh session in main thread */ fio_disconnect(); - /* Print the list of files to backup catalog */ - //write_backup_filelist(&current, backup_files_list, instance_config.pgdata, - // external_dirs, true); - /* update backup control file to update size info */ - //write_backup(&current, true); - /* Sync all copied files unless '--no-sync' flag is used */ if (no_sync) - elog(WARNING, "Backup files are not synced to disk"); + elog(WARNING, "Files are not synced to disk"); else { - elog(INFO, "Syncing backup files to disk"); + elog(INFO, "Syncing copied files to disk"); time(&start_time); - for (i = 0; i < parray_num(backup_files_list); i++) + for (i = 0; i < parray_num(source_filelist); i++) { char to_fullpath[MAXPGPATH]; - pgFile *file = (pgFile *) parray_get(backup_files_list, i); + pgFile *file = (pgFile *) parray_get(source_filelist, i); /* TODO: sync directory ?
*/ if (S_ISDIR(file->mode)) @@ -500,125 +388,80 @@ do_catchup_instance(char *source_pgdata, char *dest_pgdata, PGconn *source_conn, time(&end_time); pretty_time_interval(difftime(end_time, start_time), pretty_time, lengthof(pretty_time)); - elog(INFO, "Backup files are synced, time elapsed: %s", pretty_time); + elog(INFO, "Files are synced, time elapsed: %s", pretty_time); } - /* be paranoid about instance been from the past */ - // if (backup_mode != BACKUP_MODE_FULL && - // current.stop_lsn < prev_backup->stop_lsn) - // elog(ERROR, "Current backup STOP LSN %X/%X is lower than STOP LSN %X/%X of previous backup %s. " - // "It may indicate that we are trying to backup PostgreSQL instance from the past.", - // (uint32) (current.stop_lsn >> 32), (uint32) (current.stop_lsn), - // (uint32) (prev_backup->stop_lsn >> 32), (uint32) (prev_backup->stop_lsn), - // base36enc(prev_backup->stop_lsn)); - - /* clean external directories list */ - if (external_dirs) - free_dir_list(external_dirs); - /* Cleanup */ - if (backup_list) - { - parray_walk(backup_list, pgBackupFree); - parray_free(backup_list); - } - - if (tli_list) + if (!dest_pgdata_is_empty && dest_filelist) { - parray_walk(tli_list, timelineInfoFree); - parray_free(tli_list); + parray_walk(dest_filelist, pgFileFree); + parray_free(dest_filelist); } - parray_walk(backup_files_list, pgFileFree); - parray_free(backup_files_list); - backup_files_list = NULL; - // где закрывается backup_conn? + parray_walk(source_filelist, pgFileFree); + parray_free(source_filelist); + // где закрывается conn? } /* * Entry point of pg_probackup CATCHUP subcommand. 
- * */ int -do_catchup(char *source_pgdata, char *dest_pgdata, BackupMode backup_mode, ConnectionOptions conn_opt, bool stream_wal, int num_threads) +do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup_mode, + ConnectionOptions conn_opt, bool stream_wal, int num_threads) { - PGconn *backup_conn = NULL; + PGconn *conn = NULL; PGNodeInfo nodeInfo; - //char pretty_bytes[20]; bool no_sync = false; bool backup_logs = false; + bool dest_pgdata_is_empty = dir_is_empty(dest_pgdata, FIO_LOCAL_HOST); /* Initialize PGInfonode */ pgNodeInit(&nodeInfo); - /* ugly hack */ - instance_config.xlog_seg_size = DEFAULT_XLOG_SEG_SIZE; - - //if (!instance_config.pgdata) - // elog(ERROR, "required parameter not specified: PGDATA " - // "(-D, --pgdata)"); + // TODO: add sanity check that source PGDATA is not empty - /* Update backup status and other metainfo. */ - //current.status = BACKUP_STATUS_RUNNING; - //current.start_time = start_time; + /* Get WAL segments size and system ID of source PG instance */ + instance_config.xlog_seg_size = get_xlog_seg_size(source_pgdata); + instance_config.system_identifier = get_system_identifier(source_pgdata); - StrNCpy(current.program_version, PROGRAM_VERSION, - sizeof(current.program_version)); + StrNCpy(current.program_version, PROGRAM_VERSION, sizeof(current.program_version)); - //current.compress_alg = instance_config.compress_alg; - //current.compress_level = instance_config.compress_level; - - /* Save list of external directories */ - //if (instance_config.external_dir_str && - // (pg_strcasecmp(instance_config.external_dir_str, "none") != 0)) - // current.external_dir_str = instance_config.external_dir_str; - - elog(INFO, "Catchup start, pg_probackup version: %s, `" + elog(INFO, "Catchup start, pg_probackup version: %s, " "wal mode: %s, remote: %s, catchup-source-pgdata: %s, catchup-destination-pgdata: %s", PROGRAM_VERSION, current.stream ? "STREAM" : "ARCHIVE", IsSshProtocol() ?
"true" : "false", source_pgdata, dest_pgdata); - /* Create backup directory and BACKUP_CONTROL_FILE */ - //if (pgBackupCreateDir(&current)) - // elog(ERROR, "Cannot create backup directory"); - //if (!lock_backup(&current, true)) - // elog(ERROR, "Cannot lock backup %s directory", - // base36enc(current.start_time)); - //write_backup(&current, true); + /* Do some compatibility checks and fill basic info about PG instance */ + conn = pgdata_basic_setup(instance_config.conn_opt, &nodeInfo); - //elog(LOG, "Backup destination is initialized"); + elog(INFO, "PostgreSQL version: %s", nodeInfo.server_version_str); - /* - * setup backup_conn, do some compatibility checks and - * fill basic info about instance - */ - backup_conn = pgdata_basic_setup(instance_config.conn_opt, &nodeInfo); + if (current.from_replica) + elog(INFO, "Running catchup from standby"); - //if (current.from_replica) - // elog(INFO, "Backup %s is going to be taken from standby", base36enc(start_time)); + /* Check that connected PG instance and source PGDATA are the same */ + check_system_identifiers(conn, source_pgdata); - /* TODO, print PostgreSQL full version */ - //elog(INFO, "PostgreSQL version: %s", nodeInfo.server_version_str); - - /* - * Ensure that backup directory was initialized for the same PostgreSQL - * instance we opened connection to. And that target backup database PGDATA - * belogns to the same instance.
- */ - //check_system_identifiers(backup_conn, instance_config.pgdata); + if (!dest_pgdata_is_empty && + check_incremental_compatibility(dest_pgdata, + instance_config.system_identifier, + INCR_CHECKSUM) != DEST_OK) + elog(ERROR, "Incremental restore is not allowed"); /* below perform checks specific for backup command */ #if PG_VERSION_NUM >= 110000 - if (!RetrieveWalSegSize(backup_conn)) + if (!RetrieveWalSegSize(conn)) elog(ERROR, "Failed to retrieve wal_segment_size"); #endif - get_ptrack_version(backup_conn, &nodeInfo); - // elog(WARNING, "ptrack_version_num %d", ptrack_version_num); + // TODO: move to separate function for reuse in backup.c and catchup.c + // -> + get_ptrack_version(conn, &nodeInfo); if (nodeInfo.ptrack_version_num > 0) - nodeInfo.is_ptrack_enable = pg_ptrack_enable(backup_conn, nodeInfo.ptrack_version_num); + nodeInfo.is_ptrack_enable = pg_ptrack_enable(conn, nodeInfo.ptrack_version_num); if (backup_mode == BACKUP_MODE_DIFF_PTRACK) { @@ -630,41 +473,21 @@ do_catchup(char *source_pgdata, char *dest_pgdata, BackupMode backup_mode, Conne elog(ERROR, "Ptrack is disabled"); } } + // <- if (current.from_replica && exclusive_backup) - /* Check master connection options */ - if (instance_config.master_conn_opt.pghost == NULL) - elog(ERROR, "Options for connection to master must be provided to perform backup from replica"); - - /* backup data */ - do_catchup_instance(source_pgdata, dest_pgdata, backup_conn, &nodeInfo, backup_mode, no_sync, backup_logs); + elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); - //if (!no_validate) - // pgBackupValidate(&current, NULL); + do_catchup_instance(source_pgdata, dest_pgdata, conn, &nodeInfo, + backup_mode, no_sync, backup_logs, dest_pgdata_is_empty); - /* Notify user about backup size */ - //if (current.stream) - // pretty_size(current.data_bytes + current.wal_bytes, pretty_bytes, lengthof(pretty_bytes)); - //else - // pretty_size(current.data_bytes, pretty_bytes, lengthof(pretty_bytes)); -
//elog(INFO, "Backup %s resident size: %s", base36enc(current.start_time), pretty_bytes); - - //if (current.status == BACKUP_STATUS_OK || - // current.status == BACKUP_STATUS_DONE) - // elog(INFO, "Backup %s completed", base36enc(current.start_time)); - //else - // elog(ERROR, "Backup %s failed", base36enc(current.start_time)); + /* TODO: show the amount of transfered data in bytes and calculate incremental ratio */ return 0; } /* - * Take a backup of the PGDATA at a file level. - * Copy all directories and files listed in backup_files_list. - * If the file is 'datafile' (regular relation's main fork), read it page by page, - * verify checksum and copy. - * In incremental backup mode, copy only files or datafiles' pages changed after - * previous backup. + * TODO: add description */ static void * catchup_files(void *arg) @@ -672,55 +495,30 @@ catchup_files(void *arg) int i; char from_fullpath[MAXPGPATH]; char to_fullpath[MAXPGPATH]; - static time_t prev_time; catchup_files_arg *arguments = (catchup_files_arg *) arg; - int n_catchup_files_list = parray_num(arguments->files_list); - - /* TODO !!!! 
remove current */ - prev_time = current.start_time; + int n_files = parray_num(arguments->source_filelist); - /* backup a file */ - for (i = 0; i < n_catchup_files_list; i++) + /* catchup a file */ + for (i = 0; i < n_files; i++) { - pgFile *file = (pgFile *) parray_get(arguments->files_list, i); - pgFile *prev_file = NULL; + pgFile *file = (pgFile *) parray_get(arguments->source_filelist, i); + pgFile *dest_file = NULL; /* We have already copied all directories */ if (S_ISDIR(file->mode)) continue; - if (arguments->thread_num == 1) - { - /* update backup_content.control every 60 seconds */ - if ((difftime(time(NULL), prev_time)) > 60) - { - // write_backup_filelist(&current, arguments->files_list, arguments->from_root, - // arguments->external_dirs, false); - /* update backup control file to update size info */ - //write_backup(&current, true); - - prev_time = time(NULL); - } - } - if (!pg_atomic_test_set_flag(&file->lock)) continue; /* check for interrupt */ if (interrupted || thread_interrupted) - elog(ERROR, "interrupted during backup"); + elog(ERROR, "interrupted during catchup"); if (progress) elog(INFO, "Progress: (%d/%d).
Process file \"%s\"", - i + 1, n_catchup_files_list, file->rel_path); - - /* Handle zero sized files */ - //if (file->size == 0) - //{ - // file->write_size = 0; - // continue; - //} + i + 1, n_files, file->rel_path); /* construct destination filepath */ /* TODO разобраться нужен ли external */ @@ -749,25 +547,25 @@ catchup_files(void *arg) elog(WARNING, "Unexpected type %d of file \"%s\", skipping", file->mode, from_fullpath); - /* Check that file exist in previous backup */ + /* Check that file exist in dest pgdata */ if (arguments->backup_mode != BACKUP_MODE_FULL) { - pgFile **prev_file_tmp = NULL; - prev_file_tmp = (pgFile **) parray_bsearch(arguments->prev_filelist, + pgFile **dest_file_tmp = NULL; + dest_file_tmp = (pgFile **) parray_bsearch(arguments->dest_filelist, file, pgFileCompareRelPathWithExternal); - if (prev_file_tmp) + if (dest_file_tmp) { - /* File exists in previous backup */ + /* File exists in destination PGDATA */ file->exists_in_prev = true; - prev_file = *prev_file_tmp; + dest_file = *dest_file_tmp; } } - /* backup file */ + /* Do actual work */ if (file->is_datafile && !file->is_cfs) { catchup_data_file(&(arguments->conn_arg), file, from_fullpath, to_fullpath, - arguments->prev_start_lsn, + arguments->sync_lsn, arguments->backup_mode, NONE_COMPRESS, 0, @@ -778,7 +576,7 @@ catchup_files(void *arg) } else { - backup_non_data_file(file, prev_file, from_fullpath, to_fullpath, + backup_non_data_file(file, dest_file, from_fullpath, to_fullpath, arguments->backup_mode, current.parent_backup, true); } @@ -798,13 +596,8 @@ catchup_files(void *arg) /* ssh connection to longer needed */ fio_disconnect(); - /* Close connection */ - if (arguments->conn_arg.conn) - pgut_disconnect(arguments->conn_arg.conn); - /* Data files transferring is successful */ arguments->ret = 0; return NULL; } - diff --git a/src/help.c b/src/help.c index def680f6c..84e0b7d66 100644 --- a/src/help.c +++ b/src/help.c @@ -1026,7 +1026,7 @@ help_catchup(void) printf(_(" 
[--remote-port] [--remote-path] [--remote-user]\n")); printf(_(" [--ssh-options]\n\n")); - printf(_(" -b, --backup-mode=backup-mode backup mode=FULL|PTRACK\n")); + printf(_(" -b, --backup-mode=backup-mode backup mode=FULL|DELTA|PTRACK\n")); printf(_(" --stream stream the transaction log and include it in the backup\n")); printf(_(" -S, --slot=SLOTNAME replication slot to use\n")); printf(_(" --temp-slot use temporary replication slot\n")); diff --git a/src/pg_probackup.h b/src/pg_probackup.h index be9100e1f..bb2499a07 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -603,11 +603,11 @@ typedef struct const char *to_root; const char *external_prefix; - parray *files_list; - parray *prev_filelist; + parray *source_filelist; + parray *dest_filelist; /* TODO разобраться */ //parray *external_dirs; - XLogRecPtr prev_start_lsn; + XLogRecPtr sync_lsn; BackupMode backup_mode; ConnectionArgs conn_arg; @@ -861,7 +861,8 @@ extern char *pg_ptrack_get_block(ConnectionArgs *arguments, BlockNumber blknum, size_t *result_size, int ptrack_version_num, const char *ptrack_schema); /* in catchup.c */ -extern int do_catchup(char *source_pgdata, char *dest_pgdata, BackupMode backup_mode, ConnectionOptions conn_opt, bool stream_wal, int num_threads); +extern int do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup_mode, + ConnectionOptions conn_opt, bool stream_wal, int num_threads); /* in restore.c */ extern int do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt, @@ -882,6 +883,8 @@ extern parray *get_dbOid_exclude_list(pgBackup *backup, parray *datname_list, extern parray *get_backup_filelist(pgBackup *backup, bool strict); extern parray *read_timeline_history(const char *arclog_path, TimeLineID targetTLI, bool strict); extern bool tliIsPartOfHistory(const parray *timelines, TimeLineID tli); +extern DestDirIncrCompatibility check_incremental_compatibility(const char *pgdata, uint64 system_identifier, + IncrRestoreMode 
incremental_mode); /* in merge.c */ extern void do_merge(time_t backup_id, bool no_validate, bool no_sync); @@ -1167,9 +1170,9 @@ extern uint64 get_system_identifier(const char *pgdata_path); extern uint64 get_remote_system_identifier(PGconn *conn); extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); -extern uint32 get_xlog_seg_size(char *pgdata_path); +extern uint32 get_xlog_seg_size(const char *pgdata_path); extern void get_redo(const char *pgdata_path, RedoParams *redo); -extern XLogRecPtr get_min_recovery_point(char *pgdata_path); +extern XLogRecPtr get_min_recovery_point(const char *pgdata_path); extern void set_min_recovery_point(pgFile *file, const char *backup_path, XLogRecPtr stop_backup_lsn); extern void copy_pgcontrol_file(const char *from_fullpath, fio_location from_location, @@ -1194,7 +1197,7 @@ extern void pretty_size(int64 size, char *buf, size_t len); extern void pretty_time_interval(double time, char *buf, size_t len); extern PGconn *pgdata_basic_setup(ConnectionOptions conn_opt, PGNodeInfo *nodeInfo); -extern void check_system_identifiers(PGconn *conn, char *pgdata); +extern void check_system_identifiers(PGconn *conn, const char *pgdata); extern void parse_filelist_filenames(parray *files, const char *root); /* in ptrack.c */ diff --git a/src/restore.c b/src/restore.c index 9594ef0b0..310541c40 100644 --- a/src/restore.c +++ b/src/restore.c @@ -67,8 +67,6 @@ static void restore_chain(pgBackup *dest_backup, parray *parent_chain, parray *dbOid_exclude_list, pgRestoreParams *params, const char *pgdata_path, bool no_sync, bool cleanup_pgdata, bool backup_has_tblspc); -static DestDirIncrCompatibility check_incremental_compatibility(const char *pgdata, uint64 system_identifier, - IncrRestoreMode incremental_mode); /* * Iterate over backup list to find all ancestors of the broken parent_backup diff --git a/src/util.c b/src/util.c index 061a1d8f3..dbf7a30d1 100644 --- a/src/util.c +++ 
b/src/util.c @@ -299,7 +299,7 @@ get_remote_system_identifier(PGconn *conn) } uint32 -get_xlog_seg_size(char *pgdata_path) +get_xlog_seg_size(const char *pgdata_path) { #if PG_VERSION_NUM >= 110000 ControlFileData ControlFile; @@ -386,7 +386,7 @@ get_redo(const char *pgdata_path, RedoParams *redo) /* Get minRecoveryPoint from control file from pgdata_path */ XLogRecPtr -get_min_recovery_point(char *pgdata_path) +get_min_recovery_point(const char *pgdata_path) { ControlFileData ControlFile; char *buffer; From 2afa159d56e7ef3fbdb2b404daca87b3ff88d817 Mon Sep 17 00:00:00 2001 From: Grigory Smolkin Date: Sun, 16 May 2021 22:05:07 +0300 Subject: [PATCH 05/63] [Issue #277] run catchup test in remote mode only if envvar PGPROBACKUP_SSH_REMOTE set to "ON" --- tests/catchup.py | 14 ++++---------- tests/helpers/ptrack_helpers.py | 4 ++++ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/catchup.py b/tests/catchup.py index c98ab1dff..8b2869918 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -73,9 +73,7 @@ def test_multithread_remote_transfer(self): backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_base_dir = os.path.join(module_name, fname, 'dst'), - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4', - '--remote-proto=ssh', '--remote-host=localhost'] - ) + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4']) source_pg.stop() dest_pg.slow_start() @@ -115,9 +113,7 @@ def test_remote_catchup(self): backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_base_dir = os.path.join(module_name, fname, 'dst'), - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', - '--remote-proto=ssh', '--remote-host=localhost'] - ) + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) self.set_replica(source_pg, dest_pg) dest_pg.slow_start(replica = True) dest_pg.stop() @@ -134,10 +130,8 @@ def test_remote_catchup(self): backup_mode = 'PTRACK', 
source_pgdata = source_pg.data_dir, destination_base_dir = os.path.join(module_name, fname, 'dst'), - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', - '--remote-proto=ssh', '--remote-host=localhost'], - node = dest_pg - ) + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream'], + node = dest_pg) # stop replication source_pg.stop() diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index 60a4de496..db382e220 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -998,6 +998,10 @@ def catchup_node( '--catchup-source-pgdata={0}'.format(source_pgdata), '--catchup-destination-pgdata={0}'.format(node.data_dir) ] + + if self.remote: + cmd_list += ['--remote-proto=ssh', '--remote-host=localhost'] + self.run_pb(cmd_list + options) node.append_conf(port=node.port) From 2f843a70677545793a947fb6749cdf7812f57597 Mon Sep 17 00:00:00 2001 From: Grigory Smolkin Date: Sun, 16 May 2021 22:28:32 +0300 Subject: [PATCH 06/63] [Issue #227] Remove support of ptrack1.x, clean up redundant code in catchup --- src/catchup.c | 14 ++------- src/data.c | 78 +++++----------------------------------------- src/pg_probackup.h | 14 +++------ 3 files changed, 15 insertions(+), 91 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index a14d99456..89b4623fa 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -91,7 +91,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * backup_mode == BACKUP_MODE_DIFF_DELTA)) { dest_filelist = parray_new(); - dir_list_file(dest_filelist, dest_pgdata, + dir_list_file(dest_filelist, dest_pgdata, true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); sync_lsn = get_min_recovery_point(dest_pgdata); @@ -268,18 +268,10 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * arg->nodeInfo = nodeInfo; arg->from_root = source_pgdata; arg->to_root = dest_pgdata; - /* TODO разобраться */ - //arg->external_prefix = 
external_prefix; - //arg->external_dirs = external_dirs; arg->source_filelist = source_filelist; - /* TODO !!!! change to target file_list */ arg->dest_filelist = dest_filelist; arg->sync_lsn = sync_lsn; arg->backup_mode = backup_mode; - arg->conn_arg.conn = NULL; - arg->conn_arg.cancel_conn = NULL; - /* TODO !!!! */ - arg->hdr_map = &(current.hdr_map); arg->thread_num = i+1; /* By default there are some error */ arg->ret = 1; @@ -564,7 +556,7 @@ catchup_files(void *arg) /* Do actual work */ if (file->is_datafile && !file->is_cfs) { - catchup_data_file(&(arguments->conn_arg), file, from_fullpath, to_fullpath, + catchup_data_file(file, from_fullpath, to_fullpath, arguments->sync_lsn, arguments->backup_mode, NONE_COMPRESS, @@ -572,7 +564,7 @@ catchup_files(void *arg) arguments->nodeInfo->checksum_version, arguments->nodeInfo->ptrack_version_num, arguments->nodeInfo->ptrack_schema, - arguments->hdr_map, false); + false); } else { diff --git a/src/data.c b/src/data.c index 8643f1358..7e4f248f8 100644 --- a/src/data.c +++ b/src/data.c @@ -276,8 +276,7 @@ get_checksum_errormsg(Page page, char **errormsg, BlockNumber absolute_blkno) * return it to the caller */ static int32 -prepare_page(ConnectionArgs *conn_arg, - pgFile *file, XLogRecPtr prev_backup_start_lsn, +prepare_page(pgFile *file, XLogRecPtr prev_backup_start_lsn, BlockNumber blknum, FILE *in, BackupMode backup_mode, Page page, bool strict, @@ -395,66 +394,6 @@ prepare_page(ConnectionArgs *conn_arg, return PageIsOk; } - /* - * Get page via ptrack interface from PostgreSQL shared buffer. 
- * We do this only in the cases of PTRACK 1.x versions backup - */ - if (backup_mode == BACKUP_MODE_DIFF_PTRACK - && (ptrack_version_num >= 15 && ptrack_version_num < 20)) - { - int rc = 0; - size_t page_size = 0; - Page ptrack_page = NULL; - ptrack_page = (Page) pg_ptrack_get_block(conn_arg, file->dbOid, file->tblspcOid, - file->relOid, absolute_blknum, &page_size, - ptrack_version_num, ptrack_schema); - - if (ptrack_page == NULL) - /* This block was truncated.*/ - return PageIsTruncated; - - if (page_size != BLCKSZ) - elog(ERROR, "File: \"%s\", block %u, expected block size %d, but read %zu", - from_fullpath, blknum, BLCKSZ, page_size); - - /* - * We need to copy the page that was successfully - * retrieved from ptrack into our output "page" parameter. - */ - memcpy(page, ptrack_page, BLCKSZ); - pg_free(ptrack_page); - - /* - * UPD: It apprears that is possible to get zeroed page or page with invalid header - * from shared buffer. - * Note, that getting page with wrong checksumm from shared buffer is - * acceptable. - */ - rc = validate_one_page(page, absolute_blknum, - InvalidXLogRecPtr, page_st, - checksum_version); - - /* It is ok to get zeroed page */ - if (rc == PAGE_IS_ZEROED) - return PageIsOk; - - /* Getting page with invalid header from shared buffers is unacceptable */ - if (rc == PAGE_HEADER_IS_INVALID) - { - char *errormsg = NULL; - get_header_errormsg(page, &errormsg); - elog(ERROR, "Corruption detected in file \"%s\", block %u: %s", - from_fullpath, blknum, errormsg); - } - - /* - * We must set checksum here, because it is outdated - * in the block recieved from shared buffers. - */ - if (checksum_version) - page_st->checksum = ((PageHeader) page)->pd_checksum = pg_checksum_page(page, absolute_blknum); - } - /* * Skip page if page lsn is less than START_LSN of parent backup. * Nullified pages must be copied by DELTA backup, just to be safe. @@ -714,12 +653,11 @@ backup_data_file(ConnectionArgs* conn_arg, pgFile *file, * backup with special header. 
*/ void -catchup_data_file(ConnectionArgs* conn_arg, pgFile *file, - const char *from_fullpath, const char *to_fullpath, +catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpath, XLogRecPtr prev_backup_start_lsn, BackupMode backup_mode, CompressAlg calg, int clevel, uint32 checksum_version, int ptrack_version_num, const char *ptrack_schema, - HeaderMap *hdr_map, bool is_merge) + bool is_merge) { int rc; bool use_pagemap; @@ -796,7 +734,7 @@ catchup_data_file(ConnectionArgs* conn_arg, pgFile *file, else { /* TODO: stop handling errors internally */ - rc = copy_pages(conn_arg, to_fullpath, from_fullpath, file, + rc = copy_pages(to_fullpath, from_fullpath, file, /* send prev backup START_LSN */ backup_mode == BACKUP_MODE_DIFF_DELTA && file->exists_in_prev ? prev_backup_start_lsn : InvalidXLogRecPtr, @@ -1742,7 +1680,7 @@ check_data_file(ConnectionArgs *arguments, pgFile *file, for (blknum = 0; blknum < nblocks; blknum++) { PageState page_st; - page_state = prepare_page(NULL, file, InvalidXLogRecPtr, + page_state = prepare_page(file, InvalidXLogRecPtr, blknum, in, BACKUP_MODE_FULL, curr_page, false, checksum_version, 0, NULL, from_fullpath, &page_st); @@ -2228,7 +2166,7 @@ send_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_f while (blknum < file->n_blocks) { PageState page_st; - int rc = prepare_page(conn_arg, file, prev_backup_start_lsn, + int rc = prepare_page(file, prev_backup_start_lsn, blknum, in, backup_mode, curr_page, true, checksum_version, ptrack_version_num, ptrack_schema, @@ -2303,7 +2241,7 @@ send_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_f /* copy local file (взята из send_pages, но используется простое копирование странички, без добавления заголовков и компрессии) */ int -copy_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_fullpath, +copy_pages(const char *to_fullpath, const char *from_fullpath, pgFile *file, XLogRecPtr prev_backup_start_lsn, 
uint32 checksum_version, bool use_pagemap, BackupMode backup_mode, int ptrack_version_num, const char *ptrack_schema) @@ -2358,7 +2296,7 @@ copy_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_f while (blknum < file->n_blocks) { PageState page_st; - int rc = prepare_page(conn_arg, file, prev_backup_start_lsn, + int rc = prepare_page(file, prev_backup_start_lsn, blknum, in, backup_mode, curr_page, true, checksum_version, ptrack_version_num, ptrack_schema, diff --git a/src/pg_probackup.h b/src/pg_probackup.h index bb2499a07..5c5166391 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -601,18 +601,13 @@ typedef struct const char *from_root; const char *to_root; - const char *external_prefix; parray *source_filelist; parray *dest_filelist; - /* TODO разобраться */ - //parray *external_dirs; + XLogRecPtr sync_lsn; BackupMode backup_mode; - - ConnectionArgs conn_arg; int thread_num; - HeaderMap *hdr_map; /* * Return value from the thread. @@ -1093,12 +1088,11 @@ extern void backup_data_file(ConnectionArgs* conn_arg, pgFile *file, CompressAlg calg, int clevel, uint32 checksum_version, int ptrack_version_num, const char *ptrack_schema, HeaderMap *hdr_map, bool missing_ok); -extern void catchup_data_file(ConnectionArgs* conn_arg, pgFile *file, - const char *from_fullpath, const char *to_fullpath, +extern void catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpath, XLogRecPtr prev_backup_start_lsn, BackupMode backup_mode, CompressAlg calg, int clevel, uint32 checksum_version, int ptrack_version_num, const char *ptrack_schema, - HeaderMap *hdr_map, bool missing_ok); + bool missing_ok); extern void backup_non_data_file(pgFile *file, pgFile *prev_file, const char *from_fullpath, const char *to_fullpath, BackupMode backup_mode, time_t parent_backup_time, @@ -1226,7 +1220,7 @@ extern int send_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const c pgFile *file, XLogRecPtr prev_backup_start_lsn, CompressAlg 
calg, int clevel, uint32 checksum_version, bool use_pagemap, BackupPageHeader2 **headers, BackupMode backup_mode, int ptrack_version_num, const char *ptrack_schema); -extern int copy_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_fullpath, +extern int copy_pages(const char *to_fullpath, const char *from_fullpath, pgFile *file, XLogRecPtr prev_backup_start_lsn, uint32 checksum_version, bool use_pagemap, BackupMode backup_mode, int ptrack_version_num, const char *ptrack_schema); From 2f8617dff11f4d65d43ff3687d3899610ade44c2 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Tue, 18 May 2021 13:15:38 +0300 Subject: [PATCH 07/63] cosmetic changes --- src/backup.c | 36 ++++++++++++++++++++---------------- src/pg_probackup.h | 5 +++-- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/backup.c b/src/backup.c index 46e3ba482..fa065998f 100644 --- a/src/backup.c +++ b/src/backup.c @@ -38,19 +38,23 @@ bool exclusive_backup = false; /* Is pg_start_backup() was executed */ static bool backup_in_progress = false; -struct pg_stop_backup_result { +typedef struct PGStopBackupResult +{ /* * We will use values of snapshot_xid and invocation_time if there are * no transactions between start_lsn and stop_lsn. 
*/ TransactionId snapshot_xid; time_t invocation_time; + /* + * Fields that store pg_catalog.pg_stop_backup() result + */ XLogRecPtr lsn; size_t backup_label_content_len; char *backup_label_content; size_t tablespace_map_content_len; char *tablespace_map_content; -}; +} PGStopBackupResult; /* * Backup routines @@ -90,12 +94,12 @@ static void check_server_version(PGconn *conn, PGNodeInfo *nodeInfo); static void confirm_block_size(PGconn *conn, const char *name, int blcksz); static void set_cfs_datafiles(parray *files, const char *root, char *relative, size_t i); -static StopBackupCallbackState stop_callback_state; +static StopBackupCallbackParams stop_callback_params; static void backup_stopbackup_callback(bool fatal, void *userdata) { - StopBackupCallbackState *st = (StopBackupCallbackState *) userdata; + StopBackupCallbackParams *st = (StopBackupCallbackParams *) userdata; /* * If backup is in progress, notify stop of backup to PostgreSQL */ @@ -214,11 +218,11 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, if (prev_backup) { - if (parse_program_version(prev_backup->program_version) > parse_program_version(PROGRAM_VERSION)) - elog(ERROR, "pg_probackup binary version is %s, but backup %s version is %s. " - "pg_probackup do not guarantee to be forward compatible. " - "Please upgrade pg_probackup binary.", - PROGRAM_VERSION, base36enc(prev_backup->start_time), prev_backup->program_version); + if (parse_program_version(prev_backup->program_version) > parse_program_version(PROGRAM_VERSION)) + elog(ERROR, "pg_probackup binary version is %s, but backup %s version is %s. " + "pg_probackup do not guarantee to be forward compatible. 
" + "Please upgrade pg_probackup binary.", + PROGRAM_VERSION, base36enc(prev_backup->start_time), prev_backup->program_version); elog(INFO, "Parent backup: %s", base36enc(prev_backup->start_time)); @@ -1088,9 +1092,9 @@ pg_start_backup(InstanceState *instanceState, const char *label, bool smooth, pg * is necessary to call pg_stop_backup() in backup_cleanup(). */ backup_in_progress = true; - stop_callback_state.conn = conn; - stop_callback_state.server_version = nodeInfo->server_version; - pgut_atexit_push(backup_stopbackup_callback, &stop_callback_state); + stop_callback_params.conn = conn; + stop_callback_params.server_version = nodeInfo->server_version; + pgut_atexit_push(backup_stopbackup_callback, &stop_callback_params); /* Extract timeline and LSN from results of pg_start_backup() */ XLogDataFromLSN(PQgetvalue(res, 0, 0), &lsn_hi, &lsn_lo); @@ -1573,7 +1577,7 @@ pg_stop_backup_send(PGconn *conn, int server_version, bool is_started_on_replica elog(ERROR, "Failed to send pg_stop_backup query"); /* After we have sent pg_stop_backup, we don't need this callback anymore */ - pgut_atexit_pop(backup_stopbackup_callback, &stop_callback_state); + pgut_atexit_pop(backup_stopbackup_callback, &stop_callback_params); if (query_text) *query_text = pgut_strdup(stop_backup_query); @@ -1589,7 +1593,7 @@ pg_stop_backup_send(PGconn *conn, int server_version, bool is_started_on_replica static void pg_stop_backup_consume(PGconn *conn, int server_version, bool is_exclusive, uint32 timeout, const char *query_text, - struct pg_stop_backup_result *result) + PGStopBackupResult *result) { PGresult *query_result; uint32 pg_stop_backup_timeout = 0; @@ -1738,7 +1742,7 @@ pg_stop_backup_write_file_helper(const char *path, const char *filename, const c error_msg_filename, full_filename, strerror(errno)); /* - * It's vital to check if backup_files_list is initialized, + * It's vital to check if files_list is initialized, * because we could get here because the backup was interrupted */ if 
(file_list) @@ -1766,7 +1770,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb { PGconn *conn; bool stop_lsn_exists = false; - struct pg_stop_backup_result stop_backup_result; + PGStopBackupResult stop_backup_result; char *xlog_path,stream_xlog_path[MAXPGPATH]; /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ int timeout = (instance_config.archive_timeout > 0) ? diff --git a/src/pg_probackup.h b/src/pg_probackup.h index d02bbb033..4da12d654 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -679,10 +679,11 @@ typedef struct BackupPageHeader2 uint16 checksum; } BackupPageHeader2; -typedef struct StopBackupCallbackState { +typedef struct StopBackupCallbackParams +{ PGconn *conn; int server_version; -} StopBackupCallbackState; +} StopBackupCallbackParams; /* Special value for compressed_size field */ #define PageIsOk 0 From 61f167c094aa3898eafa5776307165bae5f364ee Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Tue, 18 May 2021 14:42:08 +0300 Subject: [PATCH 08/63] rename pg_checksum_enable() to pg_is_checksum_enabled --- src/backup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backup.c b/src/backup.c index fa065998f..50aa80939 100644 --- a/src/backup.c +++ b/src/backup.c @@ -87,7 +87,7 @@ static parray *get_database_map(PGconn *pg_startbackup_conn); static bool pgpro_support(PGconn *conn); /* Check functions */ -static bool pg_checksum_enable(PGconn *conn); +static bool pg_is_checksum_enabled(PGconn *conn); static bool pg_is_in_recovery(PGconn *conn); static bool pg_is_superuser(PGconn *conn); static void check_server_version(PGconn *conn, PGNodeInfo *nodeInfo); @@ -732,7 +732,7 @@ pgdata_basic_setup(ConnectionOptions conn_opt, PGNodeInfo *nodeInfo) /* Confirm that this server version is supported */ check_server_version(cur_conn, nodeInfo); - if (pg_checksum_enable(cur_conn)) + if (pg_is_checksum_enabled(cur_conn)) current.checksum_version = 1; else 
current.checksum_version = 0; @@ -1222,7 +1222,7 @@ get_database_map(PGconn *conn) /* Check if ptrack is enabled in target instance */ static bool -pg_checksum_enable(PGconn *conn) +pg_is_checksum_enabled(PGconn *conn) { PGresult *res_db; From 30543b22ef80132b0ec72f84103c33642ffb7fb2 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 24 May 2021 07:30:36 +0300 Subject: [PATCH 09/63] remove unused instanceState from pg_start_backup() --- src/backup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backup.c b/src/backup.c index 50aa80939..b55406507 100644 --- a/src/backup.c +++ b/src/backup.c @@ -66,7 +66,7 @@ static void *backup_files(void *arg); static void do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs); -static void pg_start_backup(InstanceState *instanceState, const char *label, bool smooth, pgBackup *backup, +static void pg_start_backup(const char *label, bool smooth, pgBackup *backup, PGNodeInfo *nodeInfo, PGconn *conn); static void pg_switch_wal(PGconn *conn); static void pg_silent_client_messages(PGconn *conn); @@ -162,7 +162,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, strlen(" with pg_probackup")); /* Call pg_start_backup function in PostgreSQL connect */ - pg_start_backup(instanceState, label, smooth_checkpoint, &current, nodeInfo, backup_conn); + pg_start_backup(label, smooth_checkpoint, &current, nodeInfo, backup_conn); /* Obtain current timeline */ #if PG_VERSION_NUM >= 90600 @@ -1063,7 +1063,7 @@ confirm_block_size(PGconn *conn, const char *name, int blcksz) * Notify start of backup to PostgreSQL server. */ static void -pg_start_backup(InstanceState *instanceState, const char *label, bool smooth, pgBackup *backup, +pg_start_backup(const char *label, bool smooth, pgBackup *backup, PGNodeInfo *nodeInfo, PGconn *conn) { PGresult *res; From 1657fee7b929375db1a95d52f0fb486ee5620ab4 Mon Sep 17 00:00:00 2001 From: "Mikhail A.
Kulagin" Date: Mon, 24 May 2021 09:07:05 +0300 Subject: [PATCH 10/63] Refactor wait_wal_lsn(): remove unused pgBackup * parameter and replace InstanceState * with simple directory string --- src/backup.c | 76 +++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 49 deletions(-) diff --git a/src/backup.c b/src/backup.c index b55406507..317688f99 100644 --- a/src/backup.c +++ b/src/backup.c @@ -75,9 +75,9 @@ static void pg_create_restore_point(PGconn *conn, time_t backup_start_time); static void pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *nodeInfo); static void pg_stop_backup_send(PGconn *conn, int server_version, bool is_started_on_replica, bool is_exclusive, char **query_text); -static XLogRecPtr wait_wal_lsn(InstanceState *instanceState, XLogRecPtr lsn, bool is_start_lsn, TimeLineID tli, +static XLogRecPtr wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr lsn, bool is_start_lsn, TimeLineID tli, bool in_prev_segment, bool segment_only, - int timeout_elevel, bool in_stream_dir, pgBackup *backup); + int timeout_elevel, bool in_stream_dir); static void check_external_for_tablespaces(parray *external_list, PGconn *backup_conn); @@ -298,7 +298,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, * Because WAL streaming will start after pg_start_backup() in stream * mode. 
*/ - wait_wal_lsn(instanceState, current.start_lsn, true, current.tli, false, true, ERROR, false, &current); + wait_wal_lsn(instanceState->instance_wal_subdir_path, current.start_lsn, true, current.tli, false, true, ERROR, false); } /* start stream replication */ @@ -314,7 +314,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, * PAGE backup in stream mode is waited twice, first for * segment in WAL archive and then for streamed segment */ - wait_wal_lsn(instanceState, current.start_lsn, true, current.tli, false, true, ERROR, true, &current); + wait_wal_lsn(dst_backup_path, current.start_lsn, true, current.tli, false, true, ERROR, true); } /* initialize backup's file list */ @@ -1298,14 +1298,12 @@ pg_is_superuser(PGconn *conn) * Returns InvalidXLogRecPtr if 'segment_only' flag is used. */ static XLogRecPtr -wait_wal_lsn(InstanceState *instanceState, XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, +wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, bool in_prev_segment, bool segment_only, - int timeout_elevel, bool in_stream_dir, pgBackup *backup) + int timeout_elevel, bool in_stream_dir) { XLogSegNo targetSegNo; - char pg_wal_dir[MAXPGPATH]; char wal_segment_path[MAXPGPATH], - *wal_segment_dir, wal_segment[MAXFNAMELEN]; bool file_exists = false; uint32 try_count = 0, @@ -1323,6 +1321,7 @@ wait_wal_lsn(InstanceState *instanceState, XLogRecPtr target_lsn, bool is_start_ GetXLogFileName(wal_segment, tli, targetSegNo, instance_config.xlog_seg_size); + join_path_components(wal_segment_path, wal_segment_dir, wal_segment); /* * In pg_start_backup we wait for 'target_lsn' in 'pg_wal' directory if it is * stream and non-page backup. Page backup needs archived WAL files, so we @@ -1330,17 +1329,6 @@ wait_wal_lsn(InstanceState *instanceState, XLogRecPtr target_lsn, bool is_start_ * * In pg_stop_backup it depends only on stream_wal.
*/ - if (in_stream_dir) - { - join_path_components(pg_wal_dir, backup->database_dir, PG_XLOG_DIR); - join_path_components(wal_segment_path, pg_wal_dir, wal_segment); - wal_segment_dir = pg_wal_dir; - } - else - { - join_path_components(wal_segment_path, instanceState->instance_wal_subdir_path, wal_segment); - wal_segment_dir = instanceState->instance_wal_subdir_path; - } /* TODO: remove this in 3.0 (it is a cludge against some old bug with archive_timeout) */ if (instance_config.archive_timeout > 0) @@ -1771,7 +1759,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb PGconn *conn; bool stop_lsn_exists = false; PGStopBackupResult stop_backup_result; - char *xlog_path,stream_xlog_path[MAXPGPATH]; + char *xlog_path, stream_xlog_path[MAXPGPATH]; /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ int timeout = (instance_config.archive_timeout > 0) ? instance_config.archive_timeout : ARCHIVE_TIMEOUT_DEFAULT; @@ -1803,13 +1791,22 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb */ pg_stop_backup_consume(conn, nodeInfo->server_version, exclusive_backup, timeout, query_text, &stop_backup_result); + if (stream_wal) + { + snprintf(stream_xlog_path, lengthof(stream_xlog_path), + "%s/%s/%s/%s", instanceState->instance_backup_subdir_path, + base36enc(backup->start_time), + DATABASE_DIR, PG_XLOG_DIR); + xlog_path = stream_xlog_path; + } + else + xlog_path = instanceState->instance_wal_subdir_path; + /* It is ok for replica to return invalid STOP LSN * UPD: Apparently it is ok even for a master. 
*/ if (!XRecOffIsValid(stop_backup_result.lsn)) { - char *xlog_path, - stream_xlog_path[MAXPGPATH]; XLogSegNo segno = 0; XLogRecPtr lsn_tmp = InvalidXLogRecPtr; @@ -1828,17 +1825,6 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X", // (uint32) (stop_backup_result.lsn >> 32), (uint32) (stop_backup_result.lsn)); - if (stream_wal) - { - snprintf(stream_xlog_path, lengthof(stream_xlog_path), - "%s/%s/%s/%s", instanceState->instance_backup_subdir_path, - base36enc(backup->start_time), - DATABASE_DIR, PG_XLOG_DIR); - xlog_path = stream_xlog_path; - } - else - xlog_path = instanceState->instance_wal_subdir_path; - GetXLogSegNo(stop_backup_result.lsn, segno, instance_config.xlog_seg_size); /* @@ -1862,8 +1848,8 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb if (stop_backup_result.lsn % instance_config.xlog_seg_size == 0) { /* Wait for segment with current stop_lsn, it is ok for it to never arrive */ - wait_wal_lsn(instanceState, stop_backup_result.lsn, false, backup->tli, - false, true, WARNING, stream_wal, backup); + wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, + false, true, WARNING, stream_wal); /* Get the first record in segment with current stop_lsn */ lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli, @@ -1890,8 +1876,8 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb /* Despite looking for previous record there is not guarantee of success * because previous record can be the contrecord. 
*/ - lsn_tmp = wait_wal_lsn(instanceState, stop_backup_result.lsn, false, backup->tli, - true, false, ERROR, stream_wal, backup); + lsn_tmp = wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, + true, false, ERROR, stream_wal); /* sanity */ if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp)) @@ -1904,8 +1890,8 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb else if (stop_backup_result.lsn % XLOG_BLCKSZ == 0) { /* Wait for segment with current stop_lsn */ - wait_wal_lsn(instanceState, stop_backup_result.lsn, false, backup->tli, - false, true, ERROR, stream_wal, backup); + wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, + false, true, ERROR, stream_wal); /* Get the next closest record in segment with current stop_lsn */ lsn_tmp = get_next_record_lsn(xlog_path, segno, backup->tli, @@ -1967,8 +1953,8 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb * look for previous record with endpoint >= STOP_LSN. 
*/ if (!stop_lsn_exists) - stop_backup_lsn = wait_wal_lsn(instanceState, stop_backup_result.lsn, false, backup->tli, - false, false, ERROR, stream_wal, backup); + stop_backup_lsn = wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, + false, false, ERROR, stream_wal); if (stream_wal) { @@ -1976,15 +1962,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb * to the passed filelist */ if(wait_WAL_streaming_end(backup_files_list)) elog(ERROR, "WAL streaming failed"); - - snprintf(stream_xlog_path, lengthof(stream_xlog_path), "%s/%s/%s/%s", - instanceState->instance_backup_subdir_path, base36enc(backup->start_time), - DATABASE_DIR, PG_XLOG_DIR); - - xlog_path = stream_xlog_path; } - else - xlog_path = instanceState->instance_wal_subdir_path; backup->stop_lsn = stop_backup_lsn; backup->recovery_xid = stop_backup_result.snapshot_xid; From fffa8b14fd66be46db17e0cc7151cad66939215e Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 24 May 2021 09:14:39 +0300 Subject: [PATCH 11/63] Refactor pg_stop_backup(): remove useless conn variable --- src/backup.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/backup.c b/src/backup.c index 317688f99..1ef50a22a 100644 --- a/src/backup.c +++ b/src/backup.c @@ -1756,7 +1756,6 @@ static void pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *nodeInfo) { - PGconn *conn; bool stop_lsn_exists = false; PGStopBackupResult stop_backup_result; char *xlog_path, stream_xlog_path[MAXPGPATH]; @@ -1769,9 +1768,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb if (!backup_in_progress) elog(ERROR, "backup is not in progress"); - conn = pg_startbackup_conn; - - pg_silent_client_messages(conn); + pg_silent_client_messages(pg_startbackup_conn); /* Create restore point * Only if backup is from master. 
@@ -1780,16 +1777,16 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb if (!backup->from_replica && !(nodeInfo->server_version < 90600 && !nodeInfo->is_superuser)) //TODO: check correctness - pg_create_restore_point(conn, backup->start_time); + pg_create_restore_point(pg_startbackup_conn, backup->start_time); /* Execute pg_stop_backup using PostgreSQL connection */ - pg_stop_backup_send(conn, nodeInfo->server_version, current.from_replica, exclusive_backup, &query_text); + pg_stop_backup_send(pg_startbackup_conn, nodeInfo->server_version, current.from_replica, exclusive_backup, &query_text); /* * Wait for the result of pg_stop_backup(), but no longer than * archive_timeout seconds */ - pg_stop_backup_consume(conn, nodeInfo->server_version, exclusive_backup, timeout, query_text, &stop_backup_result); + pg_stop_backup_consume(pg_startbackup_conn, nodeInfo->server_version, exclusive_backup, timeout, query_text, &stop_backup_result); if (stream_wal) { From ed5d71e6b3ce8b999251ee443dcb23ff3016766c Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 24 May 2021 10:22:18 +0300 Subject: [PATCH 12/63] Make some functions and variables (from backup.c) accessible from other compilation units --- src/backup.c | 43 ++++++++----------------------------------- src/pg_probackup.h | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/src/backup.c b/src/backup.c index 1ef50a22a..74ac59780 100644 --- a/src/backup.c +++ b/src/backup.c @@ -27,7 +27,7 @@ //const char *progname = "pg_probackup"; /* list of files contained in backup */ -static parray *backup_files_list = NULL; +parray *backup_files_list = NULL; /* We need critical section for datapagemap_add() in case of using threads */ static pthread_mutex_t backup_pagemap_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -36,25 +36,7 @@ static pthread_mutex_t backup_pagemap_mutex = PTHREAD_MUTEX_INITIALIZER; bool exclusive_backup = false; /* Is pg_start_backup() was executed */ -static bool backup_in_progress = false; - -typedef struct PGStopBackupResult -{ - /* - * We will use values of snapshot_xid and invocation_time if there are - * no transactions between start_lsn and stop_lsn. 
- */ - TransactionId snapshot_xid; - time_t invocation_time; - /* - * Fields that store pg_catalog.pg_stop_backup() result - */ - XLogRecPtr lsn; - size_t backup_label_content_len; - char *backup_label_content; - size_t tablespace_map_content_len; - char *tablespace_map_content; -} PGStopBackupResult; +bool backup_in_progress = false; /* * Backup routines @@ -66,18 +48,9 @@ static void *backup_files(void *arg); static void do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs); -static void pg_start_backup(const char *label, bool smooth, pgBackup *backup, - PGNodeInfo *nodeInfo, PGconn *conn); static void pg_switch_wal(PGconn *conn); -static void pg_silent_client_messages(PGconn *conn); -static void pg_create_restore_point(PGconn *conn, time_t backup_start_time); static void pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *nodeInfo); -static void pg_stop_backup_send(PGconn *conn, int server_version, bool is_started_on_replica, bool is_exclusive, char **query_text); - -static XLogRecPtr wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr lsn, bool is_start_lsn, TimeLineID tli, - bool in_prev_segment, bool segment_only, - int timeout_elevel, bool in_stream_dir); static void check_external_for_tablespaces(parray *external_list, PGconn *backup_conn); @@ -1062,7 +1035,7 @@ confirm_block_size(PGconn *conn, const char *name, int blcksz) /* * Notify start of backup to PostgreSQL server. */ -static void +void pg_start_backup(const char *label, bool smooth, pgBackup *backup, PGNodeInfo *nodeInfo, PGconn *conn) { @@ -1297,7 +1270,7 @@ pg_is_superuser(PGconn *conn) * Returns target LSN if such is found, failing that returns LSN of record prior to target LSN. * Returns InvalidXLogRecPtr if 'segment_only' flag is used. 
*/ -static XLogRecPtr +XLogRecPtr wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr target_lsn, bool is_start_lsn, TimeLineID tli, bool in_prev_segment, bool segment_only, int timeout_elevel, bool in_stream_dir) @@ -1459,7 +1432,7 @@ wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr target_lsn, bool is_start_l } /* Remove annoying NOTICE messages generated by backend */ -static void +void pg_silent_client_messages(PGconn *conn) { PGresult *res; @@ -1468,7 +1441,7 @@ pg_silent_client_messages(PGconn *conn) PQclear(res); } -static void +void pg_create_restore_point(PGconn *conn, time_t backup_start_time) { PGresult *res; @@ -1578,7 +1551,7 @@ pg_stop_backup_send(PGconn *conn, int server_version, bool is_started_on_replica * parameters: * - */ -static void +void pg_stop_backup_consume(PGconn *conn, int server_version, bool is_exclusive, uint32 timeout, const char *query_text, PGStopBackupResult *result) @@ -1709,7 +1682,7 @@ pg_stop_backup_consume(PGconn *conn, int server_version, /* * helper routine used to write backup_label and tablespace_map in pg_stop_backup() */ -static void +void pg_stop_backup_write_file_helper(const char *path, const char *filename, const char *error_msg_filename, const void *data, size_t len, parray *file_list) { diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 4da12d654..f662a25f5 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -600,7 +600,6 @@ typedef struct int ret; } backup_files_arg; - typedef struct timelineInfo timelineInfo; /* struct to collect info about timelines in WAL archive */ @@ -1260,4 +1259,42 @@ extern void start_WAL_streaming(PGconn *backup_conn, char *stream_dst_path, ConnectionOptions *conn_opt, XLogRecPtr startpos, TimeLineID starttli); extern int wait_WAL_streaming_end(parray *backup_files_list); + +/* external variables and functions, implemented in backup.c */ +typedef struct PGStopBackupResult +{ + /* + * We will use values of snapshot_xid and invocation_time if there are + * no 
transactions between start_lsn and stop_lsn. + */ + TransactionId snapshot_xid; + time_t invocation_time; + /* + * Fields that store pg_catalog.pg_stop_backup() result + */ + XLogRecPtr lsn; + size_t backup_label_content_len; + char *backup_label_content; + size_t tablespace_map_content_len; + char *tablespace_map_content; +} PGStopBackupResult; + +extern bool backup_in_progress; +extern parray *backup_files_list; + +extern void pg_start_backup(const char *label, bool smooth, pgBackup *backup, + PGNodeInfo *nodeInfo, PGconn *conn); +extern void pg_silent_client_messages(PGconn *conn); +extern void pg_create_restore_point(PGconn *conn, time_t backup_start_time); +extern void pg_stop_backup_send(PGconn *conn, int server_version, bool is_started_on_replica, bool is_exclusive, char **query_text); +extern void pg_stop_backup_consume(PGconn *conn, int server_version, + bool is_exclusive, uint32 timeout, const char *query_text, + PGStopBackupResult *result); +extern void pg_stop_backup_write_file_helper(const char *path, const char *filename, const char *error_msg_filename, + const void *data, size_t len, parray *file_list); +extern XLogRecPtr wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr lsn, bool is_start_lsn, TimeLineID tli, + bool in_prev_segment, bool segment_only, + int timeout_elevel, bool in_stream_dir); + + #endif /* PG_PROBACKUP_H */ From f7196e89e37bfc5580f801ab5a4d065b721651c7 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 24 May 2021 11:12:00 +0300 Subject: [PATCH 13/63] Remove some references to global stream_wal variable --- src/backup.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/backup.c b/src/backup.c index 74ac59780..9765ac16a 100644 --- a/src/backup.c +++ b/src/backup.c @@ -259,7 +259,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, write_backup(&current, true); /* In PAGE mode or in ARCHIVE wal-mode wait for current segment */ - if (current.backup_mode == BACKUP_MODE_DIFF_PAGE || !stream_wal) + if (current.backup_mode == BACKUP_MODE_DIFF_PAGE || !current.stream) { /* Check that archive_dir can be reached */ if (fio_access(instanceState->instance_wal_subdir_path, F_OK, FIO_BACKUP_HOST) != 0) @@ -275,7 +275,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, } /* start stream replication */ - if (stream_wal) + if (current.stream) { join_path_components(dst_backup_path, current.database_dir, PG_XLOG_DIR); fio_mkdir(dst_backup_path, DIR_PERMISSION, FIO_BACKUP_HOST); @@ -1076,7 +1076,7 @@ pg_start_backup(const char *label, bool smooth, pgBackup *backup, PQclear(res); - if ((!stream_wal || current.backup_mode == BACKUP_MODE_DIFF_PAGE) && + if ((!backup->stream || backup->backup_mode == BACKUP_MODE_DIFF_PAGE) && !backup->from_replica && !(nodeInfo->server_version < 90600 && !nodeInfo->is_superuser)) @@ -1261,7 +1261,7 @@ pg_is_superuser(PGconn *conn) * previous segment. * * Flag 'in_stream_dir' determine whether we looking for WAL in 'pg_wal' directory or - * in archive. Do note, that we cannot rely sorely on global variable 'stream_wal' because, + * in archive. Do note, that we cannot rely sorely on global variable 'stream_wal' (current.stream) because, * for example, PAGE backup must(!) look for start_lsn in archive regardless of wal_mode. * * 'timeout_elevel' determine the elevel for timeout elog message.
If elevel lighter than @@ -1407,7 +1407,7 @@ wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr target_lsn, bool is_start_l wal_delivery_str, wal_segment_path); } - if (!stream_wal && is_start_lsn && try_count == 30) + if (!current.stream && is_start_lsn && try_count == 30) elog(WARNING, "By default pg_probackup assume WAL delivery method to be ARCHIVE. " "If continuous archiving is not set up, use '--stream' option to make autonomous backup. " "Otherwise check that continuous archiving works correctly."); @@ -1753,7 +1753,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb pg_create_restore_point(pg_startbackup_conn, backup->start_time); /* Execute pg_stop_backup using PostgreSQL connection */ - pg_stop_backup_send(pg_startbackup_conn, nodeInfo->server_version, current.from_replica, exclusive_backup, &query_text); + pg_stop_backup_send(pg_startbackup_conn, nodeInfo->server_version, backup->from_replica, exclusive_backup, &query_text); /* * Wait for the result of pg_stop_backup(), but no longer than @@ -1761,7 +1761,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb */ pg_stop_backup_consume(pg_startbackup_conn, nodeInfo->server_version, exclusive_backup, timeout, query_text, &stop_backup_result); - if (stream_wal) + if (backup->stream) { snprintf(stream_xlog_path, lengthof(stream_xlog_path), "%s/%s/%s/%s", instanceState->instance_backup_subdir_path, @@ -1819,7 +1819,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb { /* Wait for segment with current stop_lsn, it is ok for it to never arrive */ wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - false, true, WARNING, stream_wal); + false, true, WARNING, backup->stream); /* Get the first record in segment with current stop_lsn */ lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli, @@ -1847,7 +1847,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb * 
because previous record can be the contrecord. */ lsn_tmp = wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - true, false, ERROR, stream_wal); + true, false, ERROR, backup->stream); /* sanity */ if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp)) @@ -1861,7 +1861,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb { /* Wait for segment with current stop_lsn */ wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - false, true, ERROR, stream_wal); + false, true, ERROR, backup->stream); /* Get the next closest record in segment with current stop_lsn */ lsn_tmp = get_next_record_lsn(xlog_path, segno, backup->tli, @@ -1924,9 +1924,9 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb */ if (!stop_lsn_exists) stop_backup_lsn = wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - false, false, ERROR, stream_wal); + false, false, ERROR, backup->stream); - if (stream_wal) + if (backup->stream) { /* This function will also add list of xlog files * to the passed filelist */ From bd9cd9f8a406c6204ba2979a8cf4f19961900c5a Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 24 May 2021 11:38:53 +0300 Subject: [PATCH 14/63] remove unused variable externaldir --- src/pg_probackup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pg_probackup.c b/src/pg_probackup.c index 1b2e7f751..3150900b6 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -68,8 +68,6 @@ static char *backup_path = NULL; static CatalogState *catalogState = NULL; /* ================ catalogState (END) =========== */ -/* colon separated external directories list ("/path1:/path2") */ -char *externaldir = NULL; /* common options */ int num_threads = 1; bool stream_wal = false; From 1793c68e8dd7cf0e524ac13f41b049299d4db915 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 24 May 2021 14:35:35 +0300 Subject: [PATCH 15/63] Yet another split of pg_stop_backup(): separate verification of stop_lsn into wait_wal_and_calculate_stop_lsn() --- src/backup.c | 263 +++++++++++++++++++++++---------------------- src/pg_probackup.h | 2 +- 2 files changed, 138 insertions(+), 127 deletions(-) diff --git a/src/backup.c b/src/backup.c index 9765ac16a..7cb100fe2 100644 --- a/src/backup.c +++ b/src/backup.c @@ -1431,6 +1431,142 @@ wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr target_lsn, bool is_start_l } } +/* + * Check stop_lsn (returned from pg_stop_backup()) and update backup->stop_lsn + */ +void +wait_wal_and_calculate_stop_lsn(const char *xlog_path, XLogRecPtr stop_lsn, pgBackup *backup) +{ + bool stop_lsn_exists = false; + + /* It is ok for replica to return invalid STOP LSN + * UPD: Apparently it is ok even for a master. + */ + if (!XRecOffIsValid(stop_lsn)) + { + XLogSegNo segno = 0; + XLogRecPtr lsn_tmp = InvalidXLogRecPtr; + + /* + * Even though the value is invalid, it's expected postgres behaviour + * and we're trying to fix it below. + */ + elog(LOG, "Invalid offset in stop_lsn value %X/%X, trying to fix", + (uint32) (stop_lsn >> 32), (uint32) (stop_lsn)); + + /* + * Note: even with gdb it is very hard to produce automated tests for + * contrecord + invalid LSN, so emulate it for manual testing. + */ + //lsn = lsn - XLOG_SEG_SIZE; + //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X", + // (uint32) (stop_lsn >> 32), (uint32) (stop_lsn)); + + GetXLogSegNo(stop_lsn, segno, instance_config.xlog_seg_size); + + /* + * Note, that there is no guarantee that corresponding WAL file even exists. + * Replica may return LSN from future and keep staying in present. + * Or it can return invalid LSN. + * + * That's bad, since we want to get real LSN to save it in backup label file + * and to use it in WAL validation. + * + * So we try to do the following: + * 1. 
Wait 'archive_timeout' seconds for segment containing stop_lsn and + * look for the first valid record in it. + * It solves the problem of occasional invalid LSN on write-busy system. + * 2. Failing that, look for record in previous segment with endpoint + * equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN + * on write-idle system. If that fails too, error out. + */ + + /* stop_lsn is pointing to a 0 byte of xlog segment */ + if (stop_lsn % instance_config.xlog_seg_size == 0) + { + /* Wait for segment with current stop_lsn, it is ok for it to never arrive */ + wait_wal_lsn(xlog_path, stop_lsn, false, backup->tli, + false, true, WARNING, backup->stream); + + /* Get the first record in segment with current stop_lsn */ + lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli, + instance_config.xlog_seg_size, + instance_config.archive_timeout); + + /* Check that returned LSN is valid and greater than stop_lsn */ + if (XLogRecPtrIsInvalid(lsn_tmp) || + !XRecOffIsValid(lsn_tmp) || + lsn_tmp < stop_lsn) + { + /* Backup from master should error out here */ + if (!backup->from_replica) + elog(ERROR, "Failed to get next WAL record after %X/%X", + (uint32) (stop_lsn >> 32), + (uint32) (stop_lsn)); + + /* No luck, falling back to looking up for previous record */ + elog(WARNING, "Failed to get next WAL record after %X/%X, " + "looking for previous WAL record", + (uint32) (stop_lsn >> 32), + (uint32) (stop_lsn)); + + /* Despite looking for previous record there is not guarantee of success + * because previous record can be the contrecord. 
+ */ + lsn_tmp = wait_wal_lsn(xlog_path, stop_lsn, false, backup->tli, + true, false, ERROR, backup->stream); + + /* sanity */ + if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp)) + elog(ERROR, "Failed to get WAL record prior to %X/%X", + (uint32) (stop_lsn >> 32), + (uint32) (stop_lsn)); + } + } + /* stop lsn is aligned to xlog block size, just find next lsn */ + else if (stop_lsn % XLOG_BLCKSZ == 0) + { + /* Wait for segment with current stop_lsn */ + wait_wal_lsn(xlog_path, stop_lsn, false, backup->tli, + false, true, ERROR, backup->stream); + + /* Get the next closest record in segment with current stop_lsn */ + lsn_tmp = get_next_record_lsn(xlog_path, segno, backup->tli, + instance_config.xlog_seg_size, + instance_config.archive_timeout, + stop_lsn); + + /* sanity */ + if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp)) + elog(ERROR, "Failed to get WAL record next to %X/%X", + (uint32) (stop_lsn >> 32), + (uint32) (stop_lsn)); + } + /* PostgreSQL returned something very illegal as STOP_LSN, error out */ + else + elog(ERROR, "Invalid stop_backup_lsn value %X/%X", + (uint32) (stop_lsn >> 32), (uint32) (stop_lsn)); + + /* Setting stop_backup_lsn will set stop point for streaming */ + stop_backup_lsn = lsn_tmp; + stop_lsn_exists = true; + } + + elog(LOG, "stop_lsn: %X/%X", + (uint32) (stop_lsn >> 32), (uint32) (stop_lsn)); + + /* + * Wait for stop_lsn to be archived or streamed. + * If replica returned valid STOP_LSN of not actually existing record, + * look for previous record with endpoint >= STOP_LSN. 
+ */ + if (!stop_lsn_exists) + stop_backup_lsn = wait_wal_lsn(xlog_path, stop_lsn, false, backup->tli, + false, false, ERROR, backup->stream); + + backup->stop_lsn = stop_backup_lsn; +} + /* Remove annoying NOTICE messages generated by backend */ void pg_silent_client_messages(PGconn *conn) @@ -1729,7 +1865,6 @@ static void pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *nodeInfo) { - bool stop_lsn_exists = false; PGStopBackupResult stop_backup_result; char *xlog_path, stream_xlog_path[MAXPGPATH]; /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ @@ -1772,121 +1907,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb else xlog_path = instanceState->instance_wal_subdir_path; - /* It is ok for replica to return invalid STOP LSN - * UPD: Apparently it is ok even for a master. - */ - if (!XRecOffIsValid(stop_backup_result.lsn)) - { - XLogSegNo segno = 0; - XLogRecPtr lsn_tmp = InvalidXLogRecPtr; - - /* - * Even though the value is invalid, it's expected postgres behaviour - * and we're trying to fix it below. - */ - elog(LOG, "Invalid offset in stop_lsn value %X/%X, trying to fix", - (uint32) (stop_backup_result.lsn >> 32), (uint32) (stop_backup_result.lsn)); - - /* - * Note: even with gdb it is very hard to produce automated tests for - * contrecord + invalid LSN, so emulate it for manual testing. - */ - //stop_backup_result.lsn = stop_backup_result.lsn - XLOG_SEG_SIZE; - //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X", - // (uint32) (stop_backup_result.lsn >> 32), (uint32) (stop_backup_result.lsn)); - - GetXLogSegNo(stop_backup_result.lsn, segno, instance_config.xlog_seg_size); - - /* - * Note, that there is no guarantee that corresponding WAL file even exists. - * Replica may return LSN from future and keep staying in present. - * Or it can return invalid LSN. 
- * - * That's bad, since we want to get real LSN to save it in backup label file - * and to use it in WAL validation. - * - * So we try to do the following: - * 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and - * look for the first valid record in it. - * It solves the problem of occasional invalid LSN on write-busy system. - * 2. Failing that, look for record in previous segment with endpoint - * equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN - * on write-idle system. If that fails too, error out. - */ - - /* stop_lsn is pointing to a 0 byte of xlog segment */ - if (stop_backup_result.lsn % instance_config.xlog_seg_size == 0) - { - /* Wait for segment with current stop_lsn, it is ok for it to never arrive */ - wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - false, true, WARNING, backup->stream); - - /* Get the first record in segment with current stop_lsn */ - lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli, - instance_config.xlog_seg_size, - instance_config.archive_timeout); - - /* Check that returned LSN is valid and greater than stop_lsn */ - if (XLogRecPtrIsInvalid(lsn_tmp) || - !XRecOffIsValid(lsn_tmp) || - lsn_tmp < stop_backup_result.lsn) - { - /* Backup from master should error out here */ - if (!backup->from_replica) - elog(ERROR, "Failed to get next WAL record after %X/%X", - (uint32) (stop_backup_result.lsn >> 32), - (uint32) (stop_backup_result.lsn)); - - /* No luck, falling back to looking up for previous record */ - elog(WARNING, "Failed to get next WAL record after %X/%X, " - "looking for previous WAL record", - (uint32) (stop_backup_result.lsn >> 32), - (uint32) (stop_backup_result.lsn)); - - /* Despite looking for previous record there is not guarantee of success - * because previous record can be the contrecord. 
- */ - lsn_tmp = wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - true, false, ERROR, backup->stream); - - /* sanity */ - if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp)) - elog(ERROR, "Failed to get WAL record prior to %X/%X", - (uint32) (stop_backup_result.lsn >> 32), - (uint32) (stop_backup_result.lsn)); - } - } - /* stop lsn is aligned to xlog block size, just find next lsn */ - else if (stop_backup_result.lsn % XLOG_BLCKSZ == 0) - { - /* Wait for segment with current stop_lsn */ - wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - false, true, ERROR, backup->stream); - - /* Get the next closest record in segment with current stop_lsn */ - lsn_tmp = get_next_record_lsn(xlog_path, segno, backup->tli, - instance_config.xlog_seg_size, - instance_config.archive_timeout, - stop_backup_result.lsn); - - /* sanity */ - if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp)) - elog(ERROR, "Failed to get WAL record next to %X/%X", - (uint32) (stop_backup_result.lsn >> 32), - (uint32) (stop_backup_result.lsn)); - } - /* PostgreSQL returned something very illegal as STOP_LSN, error out */ - else - elog(ERROR, "Invalid stop_backup_lsn value %X/%X", - (uint32) (stop_backup_result.lsn >> 32), (uint32) (stop_backup_result.lsn)); - - /* Setting stop_backup_lsn will set stop point for streaming */ - stop_backup_lsn = lsn_tmp; - stop_lsn_exists = true; - } - - elog(LOG, "stop_lsn: %X/%X", - (uint32) (stop_backup_result.lsn >> 32), (uint32) (stop_backup_result.lsn)); + wait_wal_and_calculate_stop_lsn(xlog_path, stop_backup_result.lsn, backup); /* Write backup_label and tablespace_map */ if (!exclusive_backup) @@ -1917,15 +1938,6 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb } } - /* - * Wait for stop_lsn to be archived or streamed. - * If replica returned valid STOP_LSN of not actually existing record, - * look for previous record with endpoint >= STOP_LSN. 
- */ - if (!stop_lsn_exists) - stop_backup_lsn = wait_wal_lsn(xlog_path, stop_backup_result.lsn, false, backup->tli, - false, false, ERROR, backup->stream); - if (backup->stream) { /* This function will also add list of xlog files @@ -1934,7 +1946,6 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb elog(ERROR, "WAL streaming failed"); } - backup->stop_lsn = stop_backup_lsn; backup->recovery_xid = stop_backup_result.snapshot_xid; elog(LOG, "Getting the Recovery Time from WAL"); diff --git a/src/pg_probackup.h b/src/pg_probackup.h index f662a25f5..60f1a4872 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1295,6 +1295,6 @@ extern void pg_stop_backup_write_file_helper(const char *path, const char *filen extern XLogRecPtr wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr lsn, bool is_start_lsn, TimeLineID tli, bool in_prev_segment, bool segment_only, int timeout_elevel, bool in_stream_dir); - +extern void wait_wal_and_calculate_stop_lsn(const char *xlog_path, XLogRecPtr stop_lsn, pgBackup *backup); #endif /* PG_PROBACKUP_H */ From 02aa32107433c34f6f7f264585e051cc7b5dbab1 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 24 May 2021 18:53:13 +0300 Subject: [PATCH 16/63] catchup update #1 --- src/backup.c | 6 +- src/catchup.c | 364 ++++++++++++++++++++++++++++----------------- src/pg_probackup.c | 15 +- src/pg_probackup.h | 8 +- src/ptrack.c | 2 +- 5 files changed, 243 insertions(+), 152 deletions(-) diff --git a/src/backup.c b/src/backup.c index 063b1c6e2..1e6912be4 100644 --- a/src/backup.c +++ b/src/backup.c @@ -125,7 +125,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, } /* Clear ptrack files for not PTRACK backups */ - if (current.backup_mode != BACKUP_MODE_DIFF_PTRACK && nodeInfo->is_ptrack_enable) + if (current.backup_mode != BACKUP_MODE_DIFF_PTRACK && nodeInfo->is_ptrack_enabled) pg_ptrack_clear(backup_conn, nodeInfo->ptrack_version_num); /* notify start of backup to PostgreSQL server */ @@ -812,7 +812,7 @@ do_backup(InstanceState *instanceState, pgSetBackupParams *set_backup_params, // elog(WARNING, "ptrack_version_num %d", ptrack_version_num); if (nodeInfo.ptrack_version_num > 0) - nodeInfo.is_ptrack_enable = pg_ptrack_enable(backup_conn, nodeInfo.ptrack_version_num); + nodeInfo.is_ptrack_enabled = pg_is_ptrack_enabled(backup_conn, nodeInfo.ptrack_version_num); if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK) { @@ -820,7 +820,7 @@ do_backup(InstanceState *instanceState, pgSetBackupParams *set_backup_params, elog(ERROR, "This PostgreSQL instance does not support ptrack"); else { - if (!nodeInfo.is_ptrack_enable) + if (!nodeInfo.is_ptrack_enabled) elog(ERROR, "Ptrack is disabled"); } } diff --git a/src/catchup.c b/src/catchup.c index 084713dcf..5f603ab27 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -2,8 +2,7 @@ * * catchup.c: sync DB cluster * - * Portions Copyright (c) 2009-2013, NIPPON TELEGRAPH AND TELEPHONE CORPORATION - * Portions Copyright (c) 2015-2021, Postgres Professional + * Copyright (c) 2021, Postgres Professional * *------------------------------------------------------------------------- */ @@ -27,7 +26,118 @@ 
/* * Catchup routines */ -static void *catchup_files(void *arg); +static PGconn *catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata, + BackupMode backup_mode, ConnectionOptions conn_opt); +static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, + const char *source_pgdata, BackupMode backup_mode); +static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, + PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs, + bool dest_pgdata_is_empty); +static void *catchup_thread_runner(void *arg); + +/* + * Entry point of pg_probackup CATCHUP subcommand. + */ +int +do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup_mode, + ConnectionOptions conn_opt, int num_threads) +{ + PGconn *source_conn = NULL; + PGNodeInfo source_node_info; + bool no_sync = false; + bool backup_logs = false; + bool dest_pgdata_is_empty = dir_is_empty(dest_pgdata, FIO_LOCAL_HOST); + + source_conn = catchup_collect_info(&source_node_info, source_pgdata, dest_pgdata, backup_mode, conn_opt); + catchup_preflight_checks(&source_node_info, source_conn, source_pgdata, backup_mode); + + if (!dest_pgdata_is_empty && + check_incremental_compatibility(dest_pgdata, + instance_config.system_identifier, + INCR_CHECKSUM) != DEST_OK) + elog(ERROR, "Incremental restore is not allowed"); + + if (current.from_replica && exclusive_backup) + elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); + + do_catchup_instance(source_pgdata, dest_pgdata, source_conn, &source_node_info, + backup_mode, no_sync, backup_logs, dest_pgdata_is_empty); + + /* TODO: show the amount of transfered data in bytes and calculate incremental ratio */ + + return 0; +} + +static PGconn * +catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata, + BackupMode backup_mode, ConnectionOptions conn_opt) +{ + PGconn 
*source_conn; + /* Initialize PGInfonode */ + pgNodeInit(source_node_info); + + /* Get WAL segments size and system ID of source PG instance */ + instance_config.xlog_seg_size = get_xlog_seg_size(source_pgdata); + instance_config.system_identifier = get_system_identifier(source_pgdata); + current.start_time = time(NULL); + + StrNCpy(current.program_version, PROGRAM_VERSION, sizeof(current.program_version)); + //current.compress_alg = instance_config.compress_alg; + //current.compress_level = instance_config.compress_level; + + /* Do some compatibility checks and fill basic info about PG instance */ + source_conn = pgdata_basic_setup(conn_opt, source_node_info); + + /* below perform checks specific for backup command */ +#if PG_VERSION_NUM >= 110000 + if (!RetrieveWalSegSize(source_conn)) + elog(ERROR, "Failed to retrieve wal_segment_size"); +#endif + + get_ptrack_version(source_conn, source_node_info); + if (source_node_info->ptrack_version_num > 0) + source_node_info->is_ptrack_enabled = pg_is_ptrack_enabled(source_conn, source_node_info->ptrack_version_num); + + /* Obtain current timeline */ +#if PG_VERSION_NUM >= 90600 + current.tli = get_current_timeline(source_conn); +#else + current.tli = get_current_timeline_from_control(false); +#endif + + elog(INFO, "Catchup start, pg_probackup version: %s, " + "PostgreSQL version: %s, " + "remote: %s, catchup-source-pgdata: %s, catchup-destination-pgdata: %s", + PROGRAM_VERSION, source_node_info->server_version_str, + IsSshProtocol() ? 
"true" : "false", + source_pgdata, dest_pgdata); + + if (current.from_replica) + elog(INFO, "Running catchup from standby"); + + return source_conn; +} + +static void +catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, + const char *source_pgdata, BackupMode backup_mode) +{ + // TODO: add sanity check that source PGDATA is not empty + + /* Check that connected PG instance and source PGDATA are the same */ + check_system_identifiers(source_conn, source_pgdata); + + if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + { + if (source_node_info->ptrack_version_num == 0) + elog(ERROR, "This PostgreSQL instance does not support ptrack"); + else if (source_node_info->ptrack_version_num < 20) + elog(ERROR, "ptrack extension is too old.\n" + "Upgrade ptrack to version >= 2"); + else if (!source_node_info->is_ptrack_enabled) + elog(ERROR, "Ptrack is disabled"); + } +} /* * TODO: @@ -36,18 +146,17 @@ static void *catchup_files(void *arg); */ static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, - PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs, + PGNodeInfo *source_node_info, BackupMode backup_mode, bool no_sync, bool backup_logs, bool dest_pgdata_is_empty) { int i; - char dst_xlog_path[MAXPGPATH]; + char dest_xlog_path[MAXPGPATH]; char label[1024]; XLogRecPtr sync_lsn = InvalidXLogRecPtr; - XLogRecPtr start_lsn; /* arrays with meta info for multi threaded backup */ pthread_t *threads; - catchup_files_arg *threads_args; + catchup_thread_runner_arg *threads_args; bool catchup_isok = true; parray *source_filelist = NULL; @@ -70,15 +179,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * strlen(" with pg_probackup")); /* Call pg_start_backup function in PostgreSQL connect */ - pg_start_backup(label, smooth_checkpoint, ¤t, nodeInfo, source_conn); - elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (start_lsn >> 32), (uint32) (start_lsn)); - - /* 
Obtain current timeline */ -#if PG_VERSION_NUM >= 90600 - current.tli = get_current_timeline(source_conn); -#else - current.tli = get_current_timeline_from_control(false); -#endif + pg_start_backup(label, smooth_checkpoint, ¤t, source_node_info, source_conn); + elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn)); if (!dest_pgdata_is_empty && (backup_mode == BACKUP_MODE_DIFF_PTRACK || @@ -97,43 +199,33 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * */ if (backup_mode == BACKUP_MODE_DIFF_PTRACK) { - XLogRecPtr ptrack_lsn = get_last_ptrack_lsn(source_conn, nodeInfo); - - if (nodeInfo->ptrack_version_num < 20) - { - elog(ERROR, "ptrack extension is too old.\n" - "Upgrade ptrack to version >= 2"); - } - else - { - // new ptrack is more robust and checks Start LSN - if (ptrack_lsn > sync_lsn || ptrack_lsn == InvalidXLogRecPtr) - { - elog(ERROR, "LSN from ptrack_control %X/%X is greater than checkpoint LSN %X/%X.\n" - "Create new full backup before an incremental one.", - (uint32) (ptrack_lsn >> 32), (uint32) (ptrack_lsn), - (uint32) (sync_lsn >> 32), - (uint32) (sync_lsn)); - } - } + XLogRecPtr ptrack_lsn = get_last_ptrack_lsn(source_conn, source_node_info); + + // new ptrack is more robust and checks Start LSN + if (ptrack_lsn > sync_lsn || ptrack_lsn == InvalidXLogRecPtr) + elog(ERROR, "LSN from ptrack_control %X/%X is greater than checkpoint LSN %X/%X.\n" + "Create new full backup before an incremental one.", + (uint32) (ptrack_lsn >> 32), (uint32) (ptrack_lsn), + (uint32) (sync_lsn >> 32), + (uint32) (sync_lsn)); } - /* Check that sync_lsn is less than start_lsn */ + /* Check that sync_lsn is less than current.start_lsn */ /* TODO это нужно? 
*/ if (backup_mode != BACKUP_MODE_FULL && - sync_lsn > start_lsn) + sync_lsn > current.start_lsn) elog(ERROR, "Current START LSN %X/%X is lower than SYNC LSN %X/%X, " "it may indicate that we are trying to catchup with PostgreSQL instance from the past", - (uint32) (start_lsn >> 32), (uint32) (start_lsn), + (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn), (uint32) (sync_lsn >> 32), (uint32) (sync_lsn)); /* Start stream replication */ if (stream_wal) { - join_path_components(dst_xlog_path, dest_pgdata, PG_XLOG_DIR); - fio_mkdir(dst_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); - start_WAL_streaming(source_conn, dst_xlog_path, &instance_config.conn_opt, - start_lsn, current.tli); + join_path_components(dest_xlog_path, dest_pgdata, PG_XLOG_DIR); + fio_mkdir(dest_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); + start_WAL_streaming(source_conn, dest_xlog_path, &instance_config.conn_opt, + current.start_lsn, current.tli); } /* initialize backup list */ @@ -189,7 +281,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * parse_filelist_filenames(source_filelist, source_pgdata); elog(LOG, "Current Start LSN: %X/%X, TLI: %X", - (uint32) (start_lsn >> 32), (uint32) (start_lsn), + (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn), current.tli); /* TODO проверить, нужна ли проверка TLI */ /*if (backup_mode != BACKUP_MODE_FULL) @@ -200,17 +292,15 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Build page mapping in PTRACK mode */ - if (backup_mode == BACKUP_MODE_DIFF_PAGE) - elog(ERROR, "Catchup in PAGE mode currently is not supported"); - else if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + if (backup_mode == BACKUP_MODE_DIFF_PTRACK) { time(&start_time); elog(INFO, "Extracting pagemap of changed blocks"); /* Build the page map from ptrack information */ make_pagemap_from_ptrack_2(source_filelist, source_conn, - nodeInfo->ptrack_schema, - nodeInfo->ptrack_version_num, + 
source_node_info->ptrack_schema, + source_node_info->ptrack_version_num, sync_lsn); time(&end_time); elog(INFO, "Pagemap successfully extracted, time elapsed: %.0f sec", @@ -253,13 +343,13 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* init thread args with own file lists */ threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); - threads_args = (catchup_files_arg *) palloc(sizeof(catchup_files_arg)*num_threads); + threads_args = (catchup_thread_runner_arg *) palloc(sizeof(catchup_thread_runner_arg)*num_threads); for (i = 0; i < num_threads; i++) { - catchup_files_arg *arg = &(threads_args[i]); + catchup_thread_runner_arg *arg = &(threads_args[i]); - arg->nodeInfo = nodeInfo; + arg->nodeInfo = source_node_info; arg->from_root = source_pgdata; arg->to_root = dest_pgdata; arg->source_filelist = source_filelist; @@ -277,10 +367,10 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * time(&start_time); for (i = 0; i < num_threads; i++) { - catchup_files_arg *arg = &(threads_args[i]); + catchup_thread_runner_arg *arg = &(threads_args[i]); elog(VERBOSE, "Start thread num: %i", i); - pthread_create(&threads[i], NULL, catchup_files, arg); + pthread_create(&threads[i], NULL, catchup_thread_runner, arg); } /* Wait threads */ @@ -302,8 +392,85 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * pretty_time); /* Notify end of backup */ - current.start_lsn = start_lsn; - //!!!!!! pg_stop_backup(¤t, source_conn, nodeInfo, dest_pgdata); + //!!!!! + //catchup_pg_stop_backup(¤t, source_conn, source_node_info, dest_pgdata); + +/* + * Notify end of backup to PostgreSQL server. + */ +//static void +//catchup_pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *source_node_info, const char *destination_dir) +//{ + PGStopBackupResult stop_backup_result; + /* kludge against some old bug in archive_timeout. 
TODO: remove in 3.0.0 */ + int timeout = (instance_config.archive_timeout > 0) ? + instance_config.archive_timeout : ARCHIVE_TIMEOUT_DEFAULT; + char *query_text = NULL; + + pg_silent_client_messages(source_conn); + + /* Create restore point + * Only if backup is from master. + * For PG 9.5 create restore point only if pguser is superuser. + */ + if (!current.from_replica && + !(source_node_info->server_version < 90600 && + !source_node_info->is_superuser)) //TODO: check correctness + pg_create_restore_point(source_conn, current.start_time); + + /* Execute pg_stop_backup using PostgreSQL connection */ + pg_stop_backup_send(source_conn, source_node_info->server_version, current.from_replica, exclusive_backup, &query_text); + + /* + * Wait for the result of pg_stop_backup(), but no longer than + * archive_timeout seconds + */ + pg_stop_backup_consume(source_conn, source_node_info->server_version, exclusive_backup, timeout, query_text, &stop_backup_result); + + wait_wal_and_calculate_stop_lsn(dest_xlog_path, stop_backup_result.lsn, ¤t); + + /* Write backup_label and tablespace_map */ + Assert(stop_backup_result.backup_label_content != NULL); + + /* Write backup_label */ + pg_stop_backup_write_file_helper(dest_pgdata, PG_BACKUP_LABEL_FILE, "backup label", + stop_backup_result.backup_label_content, stop_backup_result.backup_label_content_len, + backup_files_list); + free(stop_backup_result.backup_label_content); + stop_backup_result.backup_label_content = NULL; + stop_backup_result.backup_label_content_len = 0; + + /* Write tablespace_map */ + if (stop_backup_result.tablespace_map_content != NULL) + { + pg_stop_backup_write_file_helper(dest_pgdata, PG_TABLESPACE_MAP_FILE, "tablespace map", + stop_backup_result.tablespace_map_content, stop_backup_result.tablespace_map_content_len, + backup_files_list); + free(stop_backup_result.tablespace_map_content); + stop_backup_result.tablespace_map_content = NULL; + stop_backup_result.tablespace_map_content_len = 0; + } + + /* This 
function will also add list of xlog files + * to the passed filelist */ + if(wait_WAL_streaming_end(backup_files_list)) + elog(ERROR, "WAL streaming failed"); + + current.recovery_xid = stop_backup_result.snapshot_xid; + + elog(LOG, "Getting the Recovery Time from WAL"); + /* iterate over WAL from stop_backup lsn to start_backup lsn */ + if (!read_recovery_info(dest_xlog_path, current.tli, + instance_config.xlog_seg_size, + current.start_lsn, current.stop_lsn, + ¤t.recovery_time)) + { + elog(LOG, "Failed to find Recovery Time in WAL, forced to trust current_timestamp"); + current.recovery_time = stop_backup_result.invocation_time; + } + + /* Cleanup */ + pg_free(query_text); /* In case of backup from replica >= 9.6 we must fix minRecPoint, * First we must find pg_control in source_filelist. @@ -389,100 +556,17 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * // где закрывается conn? } -/* - * Entry point of pg_probackup CATCHUP subcommand. - */ -int -do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup_mode, - ConnectionOptions conn_opt, bool stream_wal, int num_threads) -{ - PGconn *conn = NULL; - PGNodeInfo nodeInfo; - bool no_sync = false; - bool backup_logs = false; - bool dest_pgdata_is_empty = dir_is_empty(dest_pgdata, FIO_LOCAL_HOST); - - /* Initialize PGInfonode */ - pgNodeInit(&nodeInfo); - - // TODO: add sanity check that source PGDATA is not empty - - /* Get WAL segments size and system ID of source PG instance */ - instance_config.xlog_seg_size = get_xlog_seg_size(source_pgdata); - instance_config.system_identifier = get_system_identifier(source_pgdata); - - StrNCpy(current.program_version, PROGRAM_VERSION, sizeof(current.program_version)); - - elog(INFO, "Catchup start, pg_probackup version: %s, " - "wal mode: %s, remote: %s, catchup-source-pgdata: %s, catchup-destination-pgdata: %s", - PROGRAM_VERSION, - current.stream ? "STREAM" : "ARCHIVE", IsSshProtocol() ? 
"true" : "false", - source_pgdata, dest_pgdata); - - /* Do some compatibility checks and fill basic info about PG instance */ - conn = pgdata_basic_setup(instance_config.conn_opt, &nodeInfo); - - elog(INFO, "PostgreSQL version: %s", nodeInfo.server_version_str); - - if (current.from_replica) - elog(INFO, "Running catchup from standby"); - - /* Check that connected PG instance and source PGDATA are the same */ - check_system_identifiers(conn, source_pgdata); - - if (!dest_pgdata_is_empty && - check_incremental_compatibility(dest_pgdata, - instance_config.system_identifier, - INCR_CHECKSUM) != DEST_OK) - elog(ERROR, "Incremental restore is not allowed"); - - /* below perform checks specific for backup command */ -#if PG_VERSION_NUM >= 110000 - if (!RetrieveWalSegSize(conn)) - elog(ERROR, "Failed to retrieve wal_segment_size"); -#endif - - // TODO: move to separate function for reuse in backup.c and catchup.c - // -> - get_ptrack_version(conn, &nodeInfo); - - if (nodeInfo.ptrack_version_num > 0) - nodeInfo.is_ptrack_enable = pg_ptrack_enable(conn, nodeInfo.ptrack_version_num); - - if (backup_mode == BACKUP_MODE_DIFF_PTRACK) - { - if (nodeInfo.ptrack_version_num == 0) - elog(ERROR, "This PostgreSQL instance does not support ptrack"); - else - { - if (!nodeInfo.is_ptrack_enable) - elog(ERROR, "Ptrack is disabled"); - } - } - // <- - - if (current.from_replica && exclusive_backup) - elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); - - do_catchup_instance(source_pgdata, dest_pgdata, conn, &nodeInfo, - backup_mode, no_sync, backup_logs, dest_pgdata_is_empty); - - /* TODO: show the amount of transfered data in bytes and calculate incremental ratio */ - - return 0; -} - /* * TODO: add description */ static void * -catchup_files(void *arg) +catchup_thread_runner(void *arg) { int i; char from_fullpath[MAXPGPATH]; char to_fullpath[MAXPGPATH]; - catchup_files_arg *arguments = (catchup_files_arg *) arg; + catchup_thread_runner_arg *arguments = 
(catchup_thread_runner_arg *) arg; int n_files = parray_num(arguments->source_filelist); /* catchup a file */ diff --git a/src/pg_probackup.c b/src/pg_probackup.c index a9e91e0e5..94bc323be 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -68,6 +68,8 @@ static char *backup_path = NULL; static CatalogState *catalogState = NULL; /* ================ catalogState (END) =========== */ +/* colon separated external directories list ("/path1:/path2") */ +char *externaldir = NULL; /* common options */ int num_threads = 1; bool stream_wal = false; @@ -451,7 +453,7 @@ main(int argc, char *argv[]) catalogState->catalog_path, WAL_SUBDIR); } - /* backup_path is required for all pg_probackup commands except help, version, checkdb and catchup */ + /* backup_path is required for all pg_probackup commands except help, version and checkdb */ if (backup_path == NULL && backup_subcmd != CHECKDB_CMD && backup_subcmd != HELP_CMD && @@ -765,8 +767,13 @@ main(int argc, char *argv[]) elog(ERROR, "You must specify \"--catchup-destination-pgdata\" option with the \"%s\" command", get_subcmd_name(backup_subcmd)); if (current.backup_mode == BACKUP_MODE_INVALID) elog(ERROR, "Required parameter not specified: BACKUP_MODE (-b, --backup-mode)"); - if (current.backup_mode != BACKUP_MODE_FULL && current.backup_mode != BACKUP_MODE_DIFF_PTRACK) - elog(ERROR, "Only \"FULL\" and \"PTRACK\" modes are supported with the \"%s\" command", get_subcmd_name(backup_subcmd)); + if (current.backup_mode != BACKUP_MODE_FULL && current.backup_mode != BACKUP_MODE_DIFF_PTRACK && current.backup_mode != BACKUP_MODE_DIFF_DELTA) + elog(ERROR, "Only \"FULL\", \"PTRACK\" and \"DELTA\" modes are supported with the \"%s\" command", get_subcmd_name(backup_subcmd)); + if (!stream_wal) + elog(INFO, "--stream is required, forcing stream mode"); + current.stream = stream_wal = true; + if (instance_config.external_dir_str) + elog(ERROR, "external directories not supported fom \"%s\" command", 
get_subcmd_name(backup_subcmd)); // TODO проверить instance_config.conn_opt } @@ -813,7 +820,7 @@ main(int argc, char *argv[]) no_validate, no_sync, backup_logs); } case CATCHUP_CMD: - return do_catchup(catchup_source_pgdata, catchup_destination_pgdata, current.backup_mode, instance_config.conn_opt, stream_wal, num_threads); + return do_catchup(catchup_source_pgdata, catchup_destination_pgdata, current.backup_mode, instance_config.conn_opt, num_threads); case RESTORE_CMD: return do_restore_or_validate(instanceState, current.backup_id, recovery_target_options, diff --git a/src/pg_probackup.h b/src/pg_probackup.h index f295675c2..0061f8ab2 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -421,7 +421,7 @@ typedef struct PGNodeInfo char server_version_str[100]; int ptrack_version_num; - bool is_ptrack_enable; + bool is_ptrack_enabled; const char *ptrack_schema; /* used only for ptrack 2.x */ } PGNodeInfo; @@ -619,7 +619,7 @@ typedef struct * 0 means there is no error, 1 - there is an error. 
*/ int ret; -} catchup_files_arg; +} catchup_thread_runner_arg; typedef struct timelineInfo timelineInfo; @@ -869,7 +869,7 @@ extern char *pg_ptrack_get_block(ConnectionArgs *arguments, int ptrack_version_num, const char *ptrack_schema); /* in catchup.c */ extern int do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup_mode, - ConnectionOptions conn_opt, bool stream_wal, int num_threads); + ConnectionOptions conn_opt, int num_threads); /* in restore.c */ extern int do_restore_or_validate(InstanceState *instanceState, time_t target_backup_id, @@ -1212,7 +1212,7 @@ extern void make_pagemap_from_ptrack_2(parray* files, PGconn* backup_conn, XLogRecPtr lsn); extern void pg_ptrack_clear(PGconn *backup_conn, int ptrack_version_num); extern void get_ptrack_version(PGconn *backup_conn, PGNodeInfo *nodeInfo); -extern bool pg_ptrack_enable(PGconn *backup_conn, int ptrack_version_num); +extern bool pg_is_ptrack_enabled(PGconn *backup_conn, int ptrack_version_num); extern bool pg_ptrack_get_and_clear_db(Oid dbOid, Oid tblspcOid, PGconn *backup_conn); extern char *pg_ptrack_get_and_clear(Oid tablespace_oid, Oid db_oid, diff --git a/src/ptrack.c b/src/ptrack.c index dc1a2c74c..84d5b841d 100644 --- a/src/ptrack.c +++ b/src/ptrack.c @@ -226,7 +226,7 @@ get_ptrack_version(PGconn *backup_conn, PGNodeInfo *nodeInfo) * Check if ptrack is enabled in target instance */ bool -pg_ptrack_enable(PGconn *backup_conn, int ptrack_version_num) +pg_is_ptrack_enabled(PGconn *backup_conn, int ptrack_version_num) { PGresult *res_db; bool result = false; From 24bd657d15c06876847d528d6a4bb7657cc53512 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 24 May 2021 19:23:55 +0300 Subject: [PATCH 17/63] make separate function for node creation in catchup tests (make_empty_node) --- tests/catchup.py | 14 ++++++------ tests/helpers/ptrack_helpers.py | 38 ++++++++++++++++----------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/catchup.py b/tests/catchup.py index 8b2869918..fde51b172 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -36,10 +36,11 @@ def test_multithread_local_transfer(self): "CREATE TABLE ultimate_question AS SELECT 42 AS answer") result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) dest_pg = self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, - destination_base_dir = os.path.join(module_name, fname, 'dst'), + destination_node = dest_pg, options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4'] ) source_pg.stop() @@ -69,10 +70,11 @@ def test_multithread_remote_transfer(self): "CREATE TABLE ultimate_question AS SELECT 42 AS answer") result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) dest_pg = self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, - destination_base_dir = os.path.join(module_name, fname, 'dst'), + destination_node = dest_pg, options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4']) source_pg.stop() @@ -109,10 +111,11 @@ def test_remote_catchup(self): source_pg.safe_psql("postgres", "CREATE TABLE ultimate_question(answer int)") # make clean shutdowned lagging behind replica + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) dest_pg = self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, - destination_base_dir = os.path.join(module_name, fname, 'dst'), + destination_node = dest_pg, options = ['-d', 
'postgres', '-p', str(source_pg.port), '--stream']) self.set_replica(source_pg, dest_pg) dest_pg.slow_start(replica = True) @@ -129,9 +132,8 @@ def test_remote_catchup(self): self.catchup_node( backup_mode = 'PTRACK', source_pgdata = source_pg.data_dir, - destination_base_dir = os.path.join(module_name, fname, 'dst'), - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream'], - node = dest_pg) + destination_node = dest_pg, + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) # stop replication source_pg.stop() diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index b7d2cd7a6..27ff06a21 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -335,14 +335,9 @@ def pg_config_version(self): # print('PGPROBACKUP_SSH_USER is not set') # exit(1) - def make_simple_node( + def make_empty_node( self, - base_dir=None, - set_replication=False, - ptrack_enable=False, - initdb_params=[], - pg_options={}): - + base_dir=None): real_base_dir = os.path.join(self.tmp_path, base_dir) shutil.rmtree(real_base_dir, ignore_errors=True) os.makedirs(real_base_dir) @@ -351,6 +346,17 @@ def make_simple_node( # bound method slow_start() to 'node' class instance node.slow_start = slow_start.__get__(node) node.should_rm_dirs = True + return node + + def make_simple_node( + self, + base_dir=None, + set_replication=False, + ptrack_enable=False, + initdb_params=[], + pg_options={}): + + node = self.make_empty_node(base_dir) node.init( initdb_params=initdb_params, allow_streaming=set_replication) @@ -1033,23 +1039,15 @@ def restore_node( def catchup_node( self, - backup_mode, source_pgdata, destination_base_dir, - options = [], - node = None + backup_mode, source_pgdata, destination_node, + options = [] ): - real_destination_dir = os.path.join(self.tmp_path, destination_base_dir) - if not node: - shutil.rmtree(real_destination_dir, ignore_errors = True) - node = testgres.get_new_node('test', base_dir = 
real_destination_dir) - node.slow_start = slow_start.__get__(node) - node.should_rm_dirs = True - cmd_list = [ 'catchup', '--backup-mode={0}'.format(backup_mode), '--catchup-source-pgdata={0}'.format(source_pgdata), - '--catchup-destination-pgdata={0}'.format(node.data_dir) + '--catchup-destination-pgdata={0}'.format(destination_node.data_dir) ] if self.remote: @@ -1057,8 +1055,8 @@ def catchup_node( self.run_pb(cmd_list + options) - node.append_conf(port=node.port) - return node + destination_node.append_conf(port=destination_node.port) + return destination_node def show_pb( self, backup_dir, instance=None, backup_id=None, From eab58c18f6dd62424d5c9e7cc84041900adf51f8 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 24 May 2021 19:29:32 +0300 Subject: [PATCH 18/63] simple delta catchup test --- tests/catchup.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/tests/catchup.py b/tests/catchup.py index fde51b172..01716ce00 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -89,7 +89,7 @@ def test_multithread_remote_transfer(self): self.del_test_dir(module_name, fname) # @unittest.skip("skip") - def test_remote_catchup(self): + def test_remote_ptrack_catchup(self): """ Test 'catchup' mode create node, @@ -150,4 +150,64 @@ def test_remote_catchup(self): # Clean after yourself self.del_test_dir(module_name, fname) + # @unittest.skip("skip") + def test_remote_delta_catchup(self): + """ + Test 'catchup' mode + create node, + make a copy with replication, start copy, stop copy, + generate some load on master, insert some test data on master, + catchup copy, start and select test data + """ + fname = self.id().split('.')[3] + + # prepare master + source_pg = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'src'), + set_replication = True, + ptrack_enable = True, + initdb_params = ['--data-checksums'] + ) + source_pg.slow_start() + source_pg.safe_psql("postgres", 
"CREATE EXTENSION ptrack") + source_pg.safe_psql("postgres", "CREATE TABLE ultimate_question(answer int)") + + # make clean shutdowned lagging behind replica + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + self.set_replica(source_pg, dest_pg) + dest_pg.slow_start(replica = True) + dest_pg.stop() + + # make changes on master + source_pg.pgbench_init(scale=10) + pgbench = source_pg.pgbench(options=['-T', '10', '--no-vacuum']) + pgbench.wait() + source_pg.safe_psql("postgres", "INSERT INTO ultimate_question VALUES(42)") + result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + # catchup + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + # stop replication + source_pg.stop() + + # check latest changes + self.set_replica(source_pg, dest_pg) + dest_pg.slow_start(replica = True) + self.assertEqual( + result, + dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), + 'Different answer from copy') + dest_pg.stop() + + # Clean after yourself + self.del_test_dir(module_name, fname) From 9efea2b094788ea4676b4d99179644a52fadea2d Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Tue, 25 May 2021 01:27:36 +0300 Subject: [PATCH 19/63] Remove get_min_recovery_point() and use get_redo() instead --- src/catchup.c | 7 ++++++- src/pg_probackup.h | 1 - src/util.c | 15 --------------- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 5f603ab27..fd2bb2e47 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -186,11 +186,16 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * (backup_mode == BACKUP_MODE_DIFF_PTRACK || backup_mode == BACKUP_MODE_DIFF_DELTA)) { + RedoParams dest_redo; + dest_filelist = parray_new(); dir_list_file(dest_filelist, dest_pgdata, true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); - sync_lsn = get_min_recovery_point(dest_pgdata); + // fill dest_redo.lsn and dest_redo.tli + get_redo(dest_pgdata, &dest_redo); + + sync_lsn = dest_redo.lsn; elog(INFO, "syncLSN = %X/%X", (uint32) (sync_lsn >> 32), (uint32) sync_lsn); } diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 0061f8ab2..a84483767 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1176,7 +1176,6 @@ extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); extern uint32 get_xlog_seg_size(const char *pgdata_path); extern void get_redo(const char *pgdata_path, RedoParams *redo); -extern XLogRecPtr get_min_recovery_point(const char *pgdata_path); extern void set_min_recovery_point(pgFile *file, const char *backup_path, XLogRecPtr stop_backup_lsn); extern void copy_pgcontrol_file(const char *from_fullpath, fio_location from_location, diff --git a/src/util.c b/src/util.c index dbf7a30d1..27b5409f5 100644 --- a/src/util.c +++ b/src/util.c @@ -384,21 +384,6 @@ get_redo(const char *pgdata_path, RedoParams *redo) redo->checksum_version = ControlFile.data_checksum_version; } -/* Get minRecoveryPoint from control file from pgdata_path */ -XLogRecPtr -get_min_recovery_point(const char *pgdata_path) -{ - 
ControlFileData ControlFile; - char *buffer; - size_t size; - - buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, FIO_LOCAL_HOST); - digestControlFile(&ControlFile, buffer, size); - pg_free(buffer); - - return ControlFile.minRecoveryPoint; -} - /* * Rewrite minRecoveryPoint of pg_control in backup directory. minRecoveryPoint * 'as-is' is not to be trusted. From 1c1a89c45813639153698da839550bb8752e8697 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Tue, 25 May 2021 05:20:12 +0300 Subject: [PATCH 20/63] catchup update #2 --- src/backup.c | 51 +++++++---- src/catchup.c | 218 +++++++++++++++++++++++---------------------- src/dir.c | 11 +++ src/pg_probackup.c | 2 +- src/pg_probackup.h | 5 +- tests/catchup.py | 58 +++++++++--- 6 files changed, 209 insertions(+), 136 deletions(-) diff --git a/src/backup.c b/src/backup.c index 1e6912be4..cfb46e1a3 100644 --- a/src/backup.c +++ b/src/backup.c @@ -335,23 +335,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, elog(ERROR, "PGDATA is almost empty. 
Either it was concurrently deleted or " "pg_probackup do not possess sufficient permissions to list PGDATA content"); - /* Calculate pgdata_bytes */ - for (i = 0; i < parray_num(backup_files_list); i++) - { - pgFile *file = (pgFile *) parray_get(backup_files_list, i); - - if (file->external_dir_num != 0) - continue; - - if (S_ISDIR(file->mode)) - { - current.pgdata_bytes += 4096; - continue; - } - - current.pgdata_bytes += file->size; - } - + current.pgdata_bytes += calculate_datasize_of_filelist(backup_files_list); pretty_size(current.pgdata_bytes, pretty_bytes, lengthof(pretty_bytes)); elog(INFO, "PGDATA size: %s", pretty_bytes); @@ -2382,3 +2366,36 @@ check_external_for_tablespaces(parray *external_list, PGconn *backup_conn) } } } + +/* + * Calculate pgdata_bytes + * accepts (parray *) of (pgFile *) + */ +int64 +calculate_datasize_of_filelist(parray *filelist) +{ + int64 bytes = 0; + int i; + + /* parray_num don't check for NULL */ + if (filelist == NULL) + return 0; + + for (i = 0; i < parray_num(filelist); i++) + { + pgFile *file = (pgFile *) parray_get(filelist, i); + + if (file->external_dir_num != 0) + continue; + + if (S_ISDIR(file->mode)) + { + // TODO is a dir always 4K? 
+ bytes += 4096; + continue; + } + + bytes += file->size; + } + return bytes; +} \ No newline at end of file diff --git a/src/catchup.c b/src/catchup.c index fd2bb2e47..456d07905 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -26,12 +26,11 @@ /* * Catchup routines */ -static PGconn *catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata, - BackupMode backup_mode, ConnectionOptions conn_opt); -static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, - const char *source_pgdata, BackupMode backup_mode); +static PGconn *catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata); +static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, const char *source_pgdata, + const char *dest_pgdata, bool dest_pgdata_is_empty); static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, - PGNodeInfo *nodeInfo, BackupMode backup_mode, bool no_sync, bool backup_logs, + PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs, bool dest_pgdata_is_empty); static void *catchup_thread_runner(void *arg); @@ -39,29 +38,19 @@ static void *catchup_thread_runner(void *arg); * Entry point of pg_probackup CATCHUP subcommand. 
*/ int -do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup_mode, - ConnectionOptions conn_opt, int num_threads) +do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads) { PGconn *source_conn = NULL; PGNodeInfo source_node_info; bool no_sync = false; bool backup_logs = false; - bool dest_pgdata_is_empty = dir_is_empty(dest_pgdata, FIO_LOCAL_HOST); + bool dest_pgdata_is_empty = dir_is_empty(dest_pgdata, FIO_LOCAL_HOST); - source_conn = catchup_collect_info(&source_node_info, source_pgdata, dest_pgdata, backup_mode, conn_opt); - catchup_preflight_checks(&source_node_info, source_conn, source_pgdata, backup_mode); - - if (!dest_pgdata_is_empty && - check_incremental_compatibility(dest_pgdata, - instance_config.system_identifier, - INCR_CHECKSUM) != DEST_OK) - elog(ERROR, "Incremental restore is not allowed"); - - if (current.from_replica && exclusive_backup) - elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); + source_conn = catchup_collect_info(&source_node_info, source_pgdata, dest_pgdata); + catchup_preflight_checks(&source_node_info, source_conn, source_pgdata, dest_pgdata, dest_pgdata_is_empty); do_catchup_instance(source_pgdata, dest_pgdata, source_conn, &source_node_info, - backup_mode, no_sync, backup_logs, dest_pgdata_is_empty); + no_sync, backup_logs, dest_pgdata_is_empty); /* TODO: show the amount of transfered data in bytes and calculate incremental ratio */ @@ -69,8 +58,7 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup } static PGconn * -catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata, - BackupMode backup_mode, ConnectionOptions conn_opt) +catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata) { PGconn *source_conn; /* Initialize PGInfonode */ @@ -86,7 +74,7 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co 
//current.compress_level = instance_config.compress_level; /* Do some compatibility checks and fill basic info about PG instance */ - source_conn = pgdata_basic_setup(conn_opt, source_node_info); + source_conn = pgdata_basic_setup(instance_config.conn_opt, source_node_info); /* below perform checks specific for backup command */ #if PG_VERSION_NUM >= 110000 @@ -120,14 +108,14 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, - const char *source_pgdata, BackupMode backup_mode) + const char *source_pgdata, const char *dest_pgdata, bool dest_pgdata_is_empty) { // TODO: add sanity check that source PGDATA is not empty /* Check that connected PG instance and source PGDATA are the same */ check_system_identifiers(source_conn, source_pgdata); - if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK) { if (source_node_info->ptrack_version_num == 0) elog(ERROR, "This PostgreSQL instance does not support ptrack"); @@ -137,6 +125,17 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, else if (!source_node_info->is_ptrack_enabled) elog(ERROR, "Ptrack is disabled"); } + + if (!dest_pgdata_is_empty && + check_incremental_compatibility(dest_pgdata, + instance_config.system_identifier, + INCR_CHECKSUM) != DEST_OK) + elog(ERROR, "Catchup is not possible in this destination"); + + if (current.from_replica && exclusive_backup) + elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); + + // TODO check if it is local catchup and source contain tablespaces } /* @@ -146,13 +145,13 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, */ static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, - PGNodeInfo *source_node_info, BackupMode backup_mode, bool no_sync, bool backup_logs, + PGNodeInfo *source_node_info, 
bool no_sync, bool backup_logs, bool dest_pgdata_is_empty) { int i; char dest_xlog_path[MAXPGPATH]; char label[1024]; - XLogRecPtr sync_lsn = InvalidXLogRecPtr; + RedoParams dest_redo = { 0, InvalidXLogRecPtr, 0 }; /* arrays with meta info for multi threaded backup */ pthread_t *threads; @@ -171,6 +170,12 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * char pretty_time[20]; char pretty_bytes[20]; + PGStopBackupResult stop_backup_result; + /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ + int timeout = (instance_config.archive_timeout > 0) ? + instance_config.archive_timeout : ARCHIVE_TIMEOUT_DEFAULT; + char *query_text = NULL; + elog(LOG, "Database catchup start"); /* notify start of backup to PostgreSQL server */ @@ -178,51 +183,48 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * strncat(label, " with pg_probackup", lengthof(label) - strlen(" with pg_probackup")); + // TODO delete dest control file + /* Call pg_start_backup function in PostgreSQL connect */ pg_start_backup(label, smooth_checkpoint, ¤t, source_node_info, source_conn); elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn)); if (!dest_pgdata_is_empty && - (backup_mode == BACKUP_MODE_DIFF_PTRACK || - backup_mode == BACKUP_MODE_DIFF_DELTA)) + (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || + current.backup_mode == BACKUP_MODE_DIFF_DELTA)) { - RedoParams dest_redo; - dest_filelist = parray_new(); dir_list_file(dest_filelist, dest_pgdata, true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); // fill dest_redo.lsn and dest_redo.tli get_redo(dest_pgdata, &dest_redo); - - sync_lsn = dest_redo.lsn; - elog(INFO, "syncLSN = %X/%X", (uint32) (sync_lsn >> 32), (uint32) sync_lsn); + elog(INFO, "syncLSN = %X/%X", (uint32) (dest_redo.lsn >> 32), (uint32) dest_redo.lsn); } /* * TODO: move to separate function to use in both backup.c and catchup.c */ - if 
(backup_mode == BACKUP_MODE_DIFF_PTRACK) + if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK) { XLogRecPtr ptrack_lsn = get_last_ptrack_lsn(source_conn, source_node_info); // new ptrack is more robust and checks Start LSN - if (ptrack_lsn > sync_lsn || ptrack_lsn == InvalidXLogRecPtr) + if (ptrack_lsn > dest_redo.lsn || ptrack_lsn == InvalidXLogRecPtr) elog(ERROR, "LSN from ptrack_control %X/%X is greater than checkpoint LSN %X/%X.\n" "Create new full backup before an incremental one.", (uint32) (ptrack_lsn >> 32), (uint32) (ptrack_lsn), - (uint32) (sync_lsn >> 32), - (uint32) (sync_lsn)); + (uint32) (dest_redo.lsn >> 32), + (uint32) (dest_redo.lsn)); } - /* Check that sync_lsn is less than current.start_lsn */ - /* TODO это нужно? */ - if (backup_mode != BACKUP_MODE_FULL && - sync_lsn > current.start_lsn) + /* Check that dest_redo.lsn is less than current.start_lsn */ + if (current.backup_mode != BACKUP_MODE_FULL && + dest_redo.lsn > current.start_lsn) elog(ERROR, "Current START LSN %X/%X is lower than SYNC LSN %X/%X, " "it may indicate that we are trying to catchup with PostgreSQL instance from the past", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn), - (uint32) (sync_lsn >> 32), (uint32) (sync_lsn)); + (uint32) (dest_redo.lsn >> 32), (uint32) (dest_redo.lsn)); /* Start stream replication */ if (stream_wal) @@ -243,31 +245,15 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * else dir_list_file(source_filelist, source_pgdata, true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); + // TODO filter pg_xlog/wal? + // TODO what if wal is not a dir (symlink to a dir)? 
/* close ssh session in main thread */ fio_disconnect(); - /* Calculate pgdata_bytes - * TODO: move to separate function to use in both backup.c and catchup.c - */ - for (i = 0; i < parray_num(source_filelist); i++) - { - pgFile *file = (pgFile *) parray_get(source_filelist, i); - - if (file->external_dir_num != 0) - continue; - - if (S_ISDIR(file->mode)) - { - current.pgdata_bytes += 4096; - continue; - } - - current.pgdata_bytes += file->size; - } - + current.pgdata_bytes += calculate_datasize_of_filelist(source_filelist); pretty_size(current.pgdata_bytes, pretty_bytes, lengthof(pretty_bytes)); - elog(INFO, "PGDATA size: %s", pretty_bytes); + elog(INFO, "Source PGDATA size: %s", pretty_bytes); /* * Sort pathname ascending. It is necessary to create intermediate @@ -285,19 +271,17 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Extract information about files in backup_list parsing their names:*/ parse_filelist_filenames(source_filelist, source_pgdata); - elog(LOG, "Current Start LSN: %X/%X, TLI: %X", + elog(LOG, "Start LSN (source): %X/%X, TLI: %X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn), current.tli); /* TODO проверить, нужна ли проверка TLI */ - /*if (backup_mode != BACKUP_MODE_FULL) - elog(LOG, "Parent Start LSN: %X/%X, TLI: %X", - (uint32) (sync_lsn >> 32), (uint32) (sync_lsn), - prev_backup->tli); - */ + if (current.backup_mode != BACKUP_MODE_FULL) + elog(LOG, "LSN in destination: %X/%X, TLI: %X", + (uint32) (dest_redo.lsn >> 32), (uint32) (dest_redo.lsn), + dest_redo.tli); /* Build page mapping in PTRACK mode */ - - if (backup_mode == BACKUP_MODE_DIFF_PTRACK) + if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK) { time(&start_time); elog(INFO, "Extracting pagemap of changed blocks"); @@ -306,7 +290,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * make_pagemap_from_ptrack_2(source_filelist, source_conn, source_node_info->ptrack_schema, 
source_node_info->ptrack_version_num, - sync_lsn); + dest_redo.lsn); time(&end_time); elog(INFO, "Pagemap successfully extracted, time elapsed: %.0f sec", difftime(end_time, start_time)); @@ -315,32 +299,68 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* * Make directories before catchup and setup threads at the same time */ + /* + * We iterate over source_filelist and for every directory with parent 'pg_tblspc' + * we must lookup this directory name in tablespace map. + * If we got a match, we treat this directory as tablespace. + * It means that we create directory specified in tablespace_map and + * original directory created as symlink to it. + */ for (i = 0; i < parray_num(source_filelist); i++) { pgFile *file = (pgFile *) parray_get(source_filelist, i); + char parent_dir[MAXPGPATH]; - /* if the entry was a directory, create it in the backup */ - if (S_ISDIR(file->mode)) + /* setup threads */ + pg_atomic_clear_flag(&file->lock); + + if (!S_ISDIR(file->mode)) + continue; + + /* + * check if it is fake "directory" and is a tablespace link + * это происходить потому что мы передали follow_symlink при построении списка + */ + /* get parent dir of rel_path */ + strncpy(parent_dir, file->rel_path, MAXPGPATH); + get_parent_directory(parent_dir); + + /* check if directory is actually link to tablespace */ + if (strcmp(parent_dir, PG_TBLSPC_DIR) != 0) { + /* if the entry is a regular directory, create it in the destination */ char dirpath[MAXPGPATH]; - if (file->external_dir_num) - { - char temp[MAXPGPATH]; - /* TODO пока непонятно, разобраться! 
*/ - /* snprintf(temp, MAXPGPATH, "%s%d", external_prefix, - file->external_dir_num); */ - join_path_components(dirpath, temp, file->rel_path); - } - else - join_path_components(dirpath, dest_pgdata, file->rel_path); + join_path_components(dirpath, dest_pgdata, file->rel_path); elog(VERBOSE, "Create directory '%s'", dirpath); fio_mkdir(dirpath, DIR_PERMISSION, FIO_BACKUP_HOST); } + else + { + /* this directory located in pg_tblspc */ + const char *linked_path = leaked_abstraction_get_tablespace_mapping(file->name); + char to_path[MAXPGPATH]; - /* setup threads */ - pg_atomic_clear_flag(&file->lock); + //elog(WARNING, "pgFile name: %s rel_path: %s linked: %s\n", file->name, file->rel_path, file->linked); + + if (!is_absolute_path(linked_path)) + elog(ERROR, "Tablespace directory path must be an absolute path: %s\n", + linked_path); + + join_path_components(to_path, dest_pgdata, file->rel_path); + + elog(VERBOSE, "Create directory \"%s\" and symbolic link \"%s\"", + linked_path, to_path); + + /* create tablespace directory */ + fio_mkdir(linked_path, DIR_PERMISSION, FIO_BACKUP_HOST); + + /* create link to linked_path */ + if (fio_symlink(linked_path, to_path, true, FIO_BACKUP_HOST) < 0) + elog(ERROR, "Could not create symbolic link \"%s\": %s", + to_path, strerror(errno)); + } } /* Sort by size for load balancing */ @@ -359,8 +379,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * arg->to_root = dest_pgdata; arg->source_filelist = source_filelist; arg->dest_filelist = dest_filelist; - arg->sync_lsn = sync_lsn; - arg->backup_mode = backup_mode; + arg->sync_lsn = dest_redo.lsn; + arg->backup_mode = current.backup_mode; arg->thread_num = i+1; /* By default there are some error */ arg->ret = 1; @@ -397,21 +417,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * pretty_time); /* Notify end of backup */ - //!!!!! 
- //catchup_pg_stop_backup(¤t, source_conn, source_node_info, dest_pgdata); - -/* - * Notify end of backup to PostgreSQL server. - */ -//static void -//catchup_pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn, PGNodeInfo *source_node_info, const char *destination_dir) -//{ - PGStopBackupResult stop_backup_result; - /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ - int timeout = (instance_config.archive_timeout > 0) ? - instance_config.archive_timeout : ARCHIVE_TIMEOUT_DEFAULT; - char *query_text = NULL; - pg_silent_client_messages(source_conn); /* Create restore point @@ -440,7 +445,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Write backup_label */ pg_stop_backup_write_file_helper(dest_pgdata, PG_BACKUP_LABEL_FILE, "backup label", stop_backup_result.backup_label_content, stop_backup_result.backup_label_content_len, - backup_files_list); + NULL); free(stop_backup_result.backup_label_content); stop_backup_result.backup_label_content = NULL; stop_backup_result.backup_label_content_len = 0; @@ -448,9 +453,10 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Write tablespace_map */ if (stop_backup_result.tablespace_map_content != NULL) { + // TODO what if tablespace is created during catchup? 
pg_stop_backup_write_file_helper(dest_pgdata, PG_TABLESPACE_MAP_FILE, "tablespace map", stop_backup_result.tablespace_map_content, stop_backup_result.tablespace_map_content_len, - backup_files_list); + NULL); free(stop_backup_result.tablespace_map_content); stop_backup_result.tablespace_map_content = NULL; stop_backup_result.tablespace_map_content_len = 0; @@ -458,7 +464,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* This function will also add list of xlog files * to the passed filelist */ - if(wait_WAL_streaming_end(backup_files_list)) + if(wait_WAL_streaming_end(NULL)) elog(ERROR, "WAL streaming failed"); current.recovery_xid = stop_backup_result.snapshot_xid; diff --git a/src/dir.c b/src/dir.c index 86848d8d5..e4ead6664 100644 --- a/src/dir.c +++ b/src/dir.c @@ -907,6 +907,17 @@ get_tablespace_mapping(const char *dir) return dir; } +/* + * TODO протёкшая абстрация, надо на этапе ревью решить что с ней делать, + * потому как непонятно, почему мы в backup.c напрямую работаем с созданием + * каталогов, видимо, когда-то подразумевалось, что вся работа будет в dir.c + */ +const char * +leaked_abstraction_get_tablespace_mapping(const char *dir) +{ + return get_tablespace_mapping(dir); +} + /* * Split argument into old_dir and new_dir and append to mapping * list. 
diff --git a/src/pg_probackup.c b/src/pg_probackup.c index 94bc323be..8b2e4a271 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -820,7 +820,7 @@ main(int argc, char *argv[]) no_validate, no_sync, backup_logs); } case CATCHUP_CMD: - return do_catchup(catchup_source_pgdata, catchup_destination_pgdata, current.backup_mode, instance_config.conn_opt, num_threads); + return do_catchup(catchup_source_pgdata, catchup_destination_pgdata, num_threads); case RESTORE_CMD: return do_restore_or_validate(instanceState, current.backup_id, recovery_target_options, diff --git a/src/pg_probackup.h b/src/pg_probackup.h index a84483767..73a7ee45d 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -868,8 +868,7 @@ extern char *pg_ptrack_get_block(ConnectionArgs *arguments, BlockNumber blknum, size_t *result_size, int ptrack_version_num, const char *ptrack_schema); /* in catchup.c */ -extern int do_catchup(const char *source_pgdata, const char *dest_pgdata, BackupMode backup_mode, - ConnectionOptions conn_opt, int num_threads); +extern int do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads); /* in restore.c */ extern int do_restore_or_validate(InstanceState *instanceState, time_t target_backup_id, @@ -1034,6 +1033,7 @@ extern void dir_list_file(parray *files, const char *root, bool exclude, bool follow_symlink, bool add_root, bool backup_logs, bool skip_hidden, int external_dir_num, fio_location location); +extern const char *leaked_abstraction_get_tablespace_mapping(const char *dir); extern void create_data_directories(parray *dest_files, const char *data_dir, const char *backup_dir, @@ -1335,5 +1335,6 @@ extern XLogRecPtr wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr lsn, bool bool in_prev_segment, bool segment_only, int timeout_elevel, bool in_stream_dir); extern void wait_wal_and_calculate_stop_lsn(const char *xlog_path, XLogRecPtr stop_lsn, pgBackup *backup); +extern int64 calculate_datasize_of_filelist(parray *filelist); 
#endif /* PG_PROBACKUP_H */ diff --git a/tests/catchup.py b/tests/catchup.py index 01716ce00..1edb4083a 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -7,41 +7,74 @@ class CatchupTest(ProbackupTest, unittest.TestCase): # @unittest.skip("skip") - def dummy(self): + def test_multithread_local_transfer(self): """ - dummy test + Test 'multithreaded basebackup' mode + create node, insert some test data, catchup into other dir, start, select test data """ fname = self.id().split('.')[3] - node = self.make_simple_node( - base_dir = os.path.join(module_name, fname, 'node') + + source_pg = self.make_simple_node(base_dir = os.path.join(module_name, fname, 'src')) + source_pg.slow_start() + source_pg.safe_psql( + "postgres", + "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4'] ) - node.slow_start() + source_pg.stop() + + dest_pg.slow_start() + self.assertEqual( + result, + dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), + 'Different answer from copy') + dest_pg.stop() # Clean after yourself - node.stop() self.del_test_dir(module_name, fname) # @unittest.skip("skip") - def test_multithread_local_transfer(self): + def test_local_simple_transfer_with_tablespace(self): """ Test 'multithreaded basebackup' mode create node, insert some test data, catchup into other dir, start, select test data """ fname = self.id().split('.')[3] - source_pg = self.make_simple_node(base_dir = os.path.join(module_name, fname, 'src')) + source_pg = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'src'), + initdb_params = ['--data-checksums']) source_pg.slow_start() + + tblspace1_old_path = 
self.get_tblspace_path(source_pg, 'tblspace1_old') + self.create_tblspace_in_node( + source_pg, 'tblspace1', + tblspc_path = tblspace1_old_path) + source_pg.safe_psql( "postgres", - "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + "CREATE TABLE ultimate_question TABLESPACE tblspace1 AS SELECT 42 AS answer") result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + tblspace1_new_path = self.get_tblspace_path(dest_pg, 'tblspace1_new') dest_pg = self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4'] + options = [ + '-d', 'postgres', + '-p', str(source_pg.port), + '--stream', + '-T', '{0}={1}'.format(tblspace1_old_path, tblspace1_new_path) + ] ) source_pg.stop() @@ -52,6 +85,10 @@ def test_multithread_local_transfer(self): 'Different answer from copy') dest_pg.stop() + source_pgdata = self.pgdata_content(source_node.data_dir) + dest_pgdata = self.pgdata_content(dest_node.data_dir) + self.compare_pgdata(source_pgdata, dest_pgdata) + # Clean after yourself self.del_test_dir(module_name, fname) @@ -211,3 +248,4 @@ def test_remote_delta_catchup(self): # Clean after yourself self.del_test_dir(module_name, fname) + From d7e2606291e3f104b9116fd8b1855783ab6eb75e Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Tue, 25 May 2021 18:05:39 +0300 Subject: [PATCH 21/63] catchup update #3 (tablespace mapping fix) --- src/catchup.c | 14 +++++++++-- src/utils/file.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ src/utils/file.h | 4 +++- tests/catchup.py | 4 ++-- 4 files changed, 77 insertions(+), 5 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 456d07905..ee254ef69 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -339,10 +339,20 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * else { /* this directory located in pg_tblspc */ - const char *linked_path = leaked_abstraction_get_tablespace_mapping(file->name); + const char *linked_path = NULL; char to_path[MAXPGPATH]; - //elog(WARNING, "pgFile name: %s rel_path: %s linked: %s\n", file->name, file->rel_path, file->linked); + { /* get full symlink path and map this path to new location */ + char source_full_path[MAXPGPATH]; + char symlink_content[MAXPGPATH]; + join_path_components(source_full_path, source_pgdata, file->rel_path); + fio_readlink(source_full_path, symlink_content, sizeof(symlink_content), FIO_DB_HOST); + linked_path = leaked_abstraction_get_tablespace_mapping(symlink_content); + elog(WARNING, "Map tablespace full_path: \"%s\" old_symlink_content: \"%s\" old_symlink_content: \"%s\"\n", + source_full_path, + symlink_content, + linked_path); + } if (!is_absolute_path(linked_path)) elog(ERROR, "Tablespace directory path must be an absolute path: %s\n", diff --git a/src/utils/file.c b/src/utils/file.c index 27de7b74a..e2a6eaf0b 100644 --- a/src/utils/file.c +++ b/src/utils/file.c @@ -1062,6 +1062,46 @@ int fio_stat(char const* path, struct stat* st, bool follow_symlink, fio_locatio } } +/* + * Read value of a symbolic link + * this is a wrapper about readlink() syscall + * side effects: string truncation occur (and it + * can be checked by caller by comparing + * returned value >= valsiz) + */ +ssize_t +fio_readlink(const char *path, char *value, size_t 
valsiz, fio_location location) +{ + if (!fio_is_remote(location)) + { + /* readlink don't place trailing \0 */ + ssize_t len = readlink(path, value, valsiz); + value[len < valsiz ? len : valsiz] = '\0'; + return len; + } + else + { + fio_header hdr; + size_t path_len = strlen(path) + 1; + + hdr.cop = FIO_READLINK; + hdr.handle = -1; + Assert(valsiz <= UINT_MAX); /* max value of fio_header.arg */ + hdr.arg = valsiz; + hdr.size = path_len; + + IO_CHECK(fio_write_all(fio_stdout, &hdr, sizeof(hdr)), sizeof(hdr)); + IO_CHECK(fio_write_all(fio_stdout, path, path_len), path_len); + + IO_CHECK(fio_read_all(fio_stdin, &hdr, sizeof(hdr)), sizeof(hdr)); + Assert(hdr.cop == FIO_READLINK); + Assert(hdr.size <= valsiz); + IO_CHECK(fio_read_all(fio_stdin, value, hdr.size), hdr.size); + value[hdr.size < valsiz ? hdr.size : valsiz] = '\0'; + return hdr.size; + } +} + /* Check presence of the file */ int fio_access(char const* path, int mode, fio_location location) { @@ -3175,6 +3215,26 @@ void fio_communicate(int in, int out) case FIO_GET_ASYNC_ERROR: fio_get_async_error_impl(out); break; + case FIO_READLINK: /* Read content of a symbolic link */ + { + /* + * We need a buf for a arguments and for a result at the same time + * hdr.size = strlen(symlink_name) + 1 + * hdr.arg = bufsize for a answer (symlink content) + */ + size_t filename_size = (size_t)hdr.size; + if (filename_size + hdr.arg > buf_size) { + buf_size = hdr.arg; + buf = (char*)realloc(buf, buf_size); + } + rc = readlink(buf, buf + filename_size, hdr.arg); + hdr.cop = FIO_READLINK; + hdr.size = rc > 0 ? 
rc : 0; + IO_CHECK(fio_write_all(out, &hdr, sizeof(hdr)), sizeof(hdr)); + if (hdr.size != 0) + IO_CHECK(fio_write_all(out, buf + filename_size, hdr.size), hdr.size); + } + break; default: Assert(false); } diff --git a/src/utils/file.h b/src/utils/file.h index 1eafe543d..c5a3f8bc0 100644 --- a/src/utils/file.h +++ b/src/utils/file.h @@ -55,7 +55,8 @@ typedef enum FIO_LIST_DIR, FIO_CHECK_POSTMASTER, FIO_GET_ASYNC_ERROR, - FIO_WRITE_ASYNC + FIO_WRITE_ASYNC, + FIO_READLINK } fio_operations; typedef enum @@ -128,6 +129,7 @@ extern int fio_mkdir(char const* path, int mode, fio_location location); extern int fio_chmod(char const* path, int mode, fio_location location); extern int fio_access(char const* path, int mode, fio_location location); extern int fio_stat(char const* path, struct stat* st, bool follow_symlinks, fio_location location); +extern ssize_t fio_readlink(const char *path, char *value, size_t valsiz, fio_location location); extern DIR* fio_opendir(char const* path, fio_location location); extern struct dirent * fio_readdir(DIR *dirp); extern int fio_closedir(DIR *dirp); diff --git a/tests/catchup.py b/tests/catchup.py index 1edb4083a..5df6df440 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -85,8 +85,8 @@ def test_local_simple_transfer_with_tablespace(self): 'Different answer from copy') dest_pg.stop() - source_pgdata = self.pgdata_content(source_node.data_dir) - dest_pgdata = self.pgdata_content(dest_node.data_dir) + source_pgdata = self.pgdata_content(source_pg.data_dir) + dest_pgdata = self.pgdata_content(dest_pg.data_dir) self.compare_pgdata(source_pgdata, dest_pgdata) # Clean after yourself From 2c8b7e967ef7abe8a48841dcd1c7d7969cf12e33 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Wed, 26 May 2021 14:52:16 +0300 Subject: [PATCH 22/63] create pfilearray_clear_locks() helper function --- src/backup.c | 7 ++++--- src/checkdb.c | 3 ++- src/dir.c | 16 ++++++++++++++++ src/pg_probackup.h | 1 + src/restore.c | 8 ++++---- src/utils/parray.c | 2 +- src/validate.c | 6 +----- 7 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/backup.c b/src/backup.c index 7cb100fe2..f102de26b 100644 --- a/src/backup.c +++ b/src/backup.c @@ -430,7 +430,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, } /* - * Make directories before backup and setup threads at the same time + * Make directories before backup */ for (i = 0; i < parray_num(backup_files_list); i++) { @@ -455,10 +455,11 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, fio_mkdir(dirpath, DIR_PERMISSION, FIO_BACKUP_HOST); } - /* setup threads */ - pg_atomic_clear_flag(&file->lock); } + /* setup thread locks */ + pfilearray_clear_locks(backup_files_list); + /* Sort by size for load balancing */ parray_qsort(backup_files_list, pgFileCompareSize); /* Sort the array for binary search */ diff --git a/src/checkdb.c b/src/checkdb.c index 4ea1d0800..5d7d6652b 100644 --- a/src/checkdb.c +++ b/src/checkdb.c @@ -455,7 +455,6 @@ get_index_list(const char *dbname, bool first_db_with_amcheck, ind->heapallindexed_is_supported = heapallindexed_is_supported; ind->amcheck_nspname = pgut_malloc(strlen(amcheck_nspname) + 1); strcpy(ind->amcheck_nspname, amcheck_nspname); - pg_atomic_clear_flag(&ind->lock); if (index_list == NULL) index_list = parray_new(); @@ -463,6 +462,8 @@ get_index_list(const char *dbname, bool first_db_with_amcheck, parray_append(index_list, ind); } + pfilearray_clear_locks(index_list); + PQclear(res); return index_list; diff --git a/src/dir.c b/src/dir.c index 86848d8d5..dfcddd7d0 100644 --- a/src/dir.c +++ b/src/dir.c @@ -222,6 +222,8 @@ pgFileInit(const char *rel_path) /* Number of blocks backed up during backup */ file->n_headers = 0; 
+ // May be add? + // pg_atomic_clear_flag(file->lock); return file; } @@ -1859,3 +1861,17 @@ cleanup_tablespace(const char *path) parray_walk(files, pgFileFree); parray_free(files); } + +/* + * Clear the synchronisation locks in a parray of (pgFile *)'s + */ +void +pfilearray_clear_locks(parray *file_list) +{ + int i; + for (i = 0; i < parray_num(file_list); i++) + { + pgFile *file = (pgFile *) parray_get(file_list, i); + pg_atomic_clear_flag(&file->lock); + } +} diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 60f1a4872..a7979ed27 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1061,6 +1061,7 @@ extern int pgFileCompareRelPathWithExternalDesc(const void *f1, const void *f2); extern int pgFileCompareLinked(const void *f1, const void *f2); extern int pgFileCompareSize(const void *f1, const void *f2); extern int pgCompareOid(const void *f1, const void *f2); +extern void pfilearray_clear_locks(parray *file_list); /* in data.c */ extern bool check_data_file(ConnectionArgs *arguments, pgFile *file, diff --git a/src/restore.c b/src/restore.c index b7071234a..4cef60005 100644 --- a/src/restore.c +++ b/src/restore.c @@ -824,7 +824,7 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, } /* - * Setup directory structure for external directories and file locks + * Setup directory structure for external directories */ for (i = 0; i < parray_num(dest_files); i++) { @@ -848,11 +848,11 @@ restore_chain(pgBackup *dest_backup, parray *parent_chain, elog(VERBOSE, "Create external directory \"%s\"", dirpath); fio_mkdir(dirpath, file->mode, FIO_DB_HOST); } - - /* setup threads */ - pg_atomic_clear_flag(&file->lock); } + /* setup threads */ + pfilearray_clear_locks(dest_files); + /* Get list of files in destination directory and remove redundant files */ if (params->incremental_mode != INCR_NONE || cleanup_pgdata) { diff --git a/src/utils/parray.c b/src/utils/parray.c index 31148ee9a..95b83365d 100644 --- a/src/utils/parray.c +++ b/src/utils/parray.c 
@@ -175,7 +175,7 @@ parray_rm(parray *array, const void *key, int(*compare)(const void *, const void size_t parray_num(const parray *array) { - return array->used; + return array!= NULL ? array->used : (size_t) 0; } void diff --git a/src/validate.c b/src/validate.c index f000698d0..4044ac158 100644 --- a/src/validate.c +++ b/src/validate.c @@ -130,11 +130,7 @@ pgBackupValidate(pgBackup *backup, pgRestoreParams *params) // params->partial_restore_type); /* setup threads */ - for (i = 0; i < parray_num(files); i++) - { - pgFile *file = (pgFile *) parray_get(files, i); - pg_atomic_clear_flag(&file->lock); - } + pfilearray_clear_locks(files); /* init thread args with own file lists */ threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); From 50cd84dba290f05d862416524f08b473c5e7b771 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Wed, 26 May 2021 15:25:02 +0300 Subject: [PATCH 23/63] followup for 02aa32107433c34f6f7f264585e051cc7b5dbab1 --- src/catalog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/catalog.c b/src/catalog.c index b1040c66d..8b56496ad 100644 --- a/src/catalog.c +++ b/src/catalog.c @@ -2883,7 +2883,7 @@ pgNodeInit(PGNodeInfo *node) node->server_version_str[0] = '\0'; node->ptrack_version_num = 0; - node->is_ptrack_enable = false; + node->is_ptrack_enabled = false; node->ptrack_schema = NULL; } From bc2f3929a1bd7ca2891376df9e015b2d1cadbd97 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Wed, 26 May 2021 16:00:59 +0300 Subject: [PATCH 24/63] catchup update #4 (fix removed files) --- src/catchup.c | 51 ++++++++++++++++++++++++++++++++++++---- tests/catchup.py | 61 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 102 insertions(+), 10 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index ee254ef69..5bb63809a 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -311,15 +311,12 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * pgFile *file = (pgFile *) parray_get(source_filelist, i); char parent_dir[MAXPGPATH]; - /* setup threads */ - pg_atomic_clear_flag(&file->lock); - if (!S_ISDIR(file->mode)) continue; /* * check if it is fake "directory" and is a tablespace link - * это происходить потому что мы передали follow_symlink при построении списка + * это происходит потому что мы передали follow_symlink при построении списка */ /* get parent dir of rel_path */ strncpy(parent_dir, file->rel_path, MAXPGPATH); @@ -342,12 +339,14 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * const char *linked_path = NULL; char to_path[MAXPGPATH]; + // perform additional check that this is actually synlink? { /* get full symlink path and map this path to new location */ char source_full_path[MAXPGPATH]; char symlink_content[MAXPGPATH]; join_path_components(source_full_path, source_pgdata, file->rel_path); fio_readlink(source_full_path, symlink_content, sizeof(symlink_content), FIO_DB_HOST); linked_path = leaked_abstraction_get_tablespace_mapping(symlink_content); + // TODO: check that linked_path != symlink_content in case of local catchup? 
elog(WARNING, "Map tablespace full_path: \"%s\" old_symlink_content: \"%s\" old_symlink_content: \"%s\"\n", source_full_path, symlink_content, @@ -373,6 +372,50 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } } + if (!dest_pgdata_is_empty && + (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || + current.backup_mode == BACKUP_MODE_DIFF_DELTA)) + { + elog(INFO, "Removing redundant files in destination directory"); + parray_qsort(dest_filelist, pgFileCompareRelPathWithExternalDesc); + for (i = 0; i < parray_num(dest_filelist); i++) + { + bool redundant = true; + pgFile *file = (pgFile *) parray_get(dest_filelist, i); + + if (parray_bsearch(source_filelist, file, pgFileCompareRelPathWithExternal)) + redundant = false; + + /* pg_filenode.map are always restored, because it's crc cannot be trusted */ + if (file->external_dir_num == 0 && + pg_strcasecmp(file->name, RELMAPPER_FILENAME) == 0) + redundant = true; + + /* do not delete the useful internal directories */ + if (S_ISDIR(file->mode) && !redundant) + continue; + + /* if file does not exists in destination list, then we can safely unlink it */ + if (redundant) + { + char fullpath[MAXPGPATH]; + + join_path_components(fullpath, dest_pgdata, file->rel_path); + + fio_delete(file->mode, fullpath, FIO_DB_HOST); + elog(VERBOSE, "Deleted file \"%s\"", fullpath); + + /* shrink pgdata list */ + pgFileFree(file); + parray_remove(dest_filelist, i); + i--; + } + } + } + + /* clear file locks */ + pfilearray_clear_locks(source_filelist); + /* Sort by size for load balancing */ parray_qsort(source_filelist, pgFileCompareSize); diff --git a/tests/catchup.py b/tests/catchup.py index 5df6df440..c90992ff4 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -42,10 +42,6 @@ def test_multithread_local_transfer(self): # @unittest.skip("skip") def test_local_simple_transfer_with_tablespace(self): - """ - Test 'multithreaded basebackup' mode - create node, insert some test data, catchup into other 
dir, start, select test data - """ fname = self.id().split('.')[3] source_pg = self.make_simple_node( @@ -203,10 +199,8 @@ def test_remote_delta_catchup(self): base_dir = os.path.join(module_name, fname, 'src'), set_replication = True, ptrack_enable = True, - initdb_params = ['--data-checksums'] ) source_pg.slow_start() - source_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") source_pg.safe_psql("postgres", "CREATE TABLE ultimate_question(answer int)") # make clean shutdowned lagging behind replica @@ -249,3 +243,58 @@ def test_remote_delta_catchup(self): # Clean after yourself self.del_test_dir(module_name, fname) + # @unittest.skip("skip") + def test_table_drop(self): + """ + Test 'multithreaded basebackup' mode + create node, insert some test data, catchup into other dir, start, select test data + """ + fname = self.id().split('.')[3] + + source_pg = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'src'), + ptrack_enable = True, + initdb_params = ['--data-checksums']) + source_pg.slow_start() + + source_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") + source_pg.safe_psql( + "postgres", + "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = [ + '-d', 'postgres', + '-p', str(source_pg.port), + '--stream' + ] + ) + + dest_pg.slow_start() + dest_pg.stop() + + source_pg.safe_psql("postgres", "DROP TABLE ultimate_question") + source_pg.safe_psql("postgres", "CHECKPOINT") + source_pg.safe_psql("postgres", "CHECKPOINT") + + # catchup + self.catchup_node( + backup_mode = 'PTRACK', + source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + + source_pg.stop() + dest_pg.slow_start() + dest_pg.stop() + + source_pgdata = 
self.pgdata_content(source_pg.data_dir) + dest_pgdata = self.pgdata_content(dest_pg.data_dir) + self.compare_pgdata(source_pgdata, dest_pgdata) + + # Clean after yourself + self.del_test_dir(module_name, fname) From 05c451eaff3b525d83194ed0f6e7ef418343b6ed Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Thu, 27 May 2021 11:39:02 +0300 Subject: [PATCH 25/63] test stabilization --- tests/catchup.py | 34 +++++++++++++++++++++++---------- tests/helpers/ptrack_helpers.py | 3 --- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/tests/catchup.py b/tests/catchup.py index c90992ff4..08ac4bc22 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -30,6 +30,9 @@ def test_multithread_local_transfer(self): ) source_pg.stop() + dest_options = {} + dest_options['port'] = str(dest_pg.port) + self.set_auto_conf(dest_pg, dest_options) dest_pg.slow_start() self.assertEqual( result, @@ -72,8 +75,16 @@ def test_local_simple_transfer_with_tablespace(self): '-T', '{0}={1}'.format(tblspace1_old_path, tblspace1_new_path) ] ) + + source_pgdata = self.pgdata_content(source_pg.data_dir) + dest_pgdata = self.pgdata_content(dest_pg.data_dir) + self.compare_pgdata(source_pgdata, dest_pgdata) + source_pg.stop() + dest_options = {} + dest_options['port'] = str(dest_pg.port) + self.set_auto_conf(dest_pg, dest_options) dest_pg.slow_start() self.assertEqual( result, @@ -81,10 +92,6 @@ def test_local_simple_transfer_with_tablespace(self): 'Different answer from copy') dest_pg.stop() - source_pgdata = self.pgdata_content(source_pg.data_dir) - dest_pgdata = self.pgdata_content(dest_pg.data_dir) - self.compare_pgdata(source_pgdata, dest_pgdata) - # Clean after yourself self.del_test_dir(module_name, fname) @@ -111,6 +118,9 @@ def test_multithread_remote_transfer(self): options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4']) source_pg.stop() + dest_options = {} + dest_options['port'] = str(dest_pg.port) + self.set_auto_conf(dest_pg, dest_options) 
dest_pg.slow_start() self.assertEqual( result, @@ -172,6 +182,9 @@ def test_remote_ptrack_catchup(self): source_pg.stop() # check latest changes + dest_options = {} + dest_options['port'] = str(dest_pg.port) + self.set_auto_conf(dest_pg, dest_options) self.set_replica(source_pg, dest_pg) dest_pg.slow_start(replica = True) self.assertEqual( @@ -211,6 +224,9 @@ def test_remote_delta_catchup(self): destination_node = dest_pg, options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) self.set_replica(source_pg, dest_pg) + dest_options = {} + dest_options['port'] = str(dest_pg.port) + self.set_auto_conf(dest_pg, dest_options) dest_pg.slow_start(replica = True) dest_pg.stop() @@ -246,8 +262,6 @@ def test_remote_delta_catchup(self): # @unittest.skip("skip") def test_table_drop(self): """ - Test 'multithreaded basebackup' mode - create node, insert some test data, catchup into other dir, start, select test data """ fname = self.id().split('.')[3] @@ -274,6 +288,9 @@ def test_table_drop(self): ] ) + dest_options = {} + dest_options['port'] = str(dest_pg.port) + self.set_auto_conf(dest_pg, dest_options) dest_pg.slow_start() dest_pg.stop() @@ -288,13 +305,10 @@ def test_table_drop(self): destination_node = dest_pg, options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) - source_pg.stop() - dest_pg.slow_start() - dest_pg.stop() - source_pgdata = self.pgdata_content(source_pg.data_dir) dest_pgdata = self.pgdata_content(dest_pg.data_dir) self.compare_pgdata(source_pgdata, dest_pgdata) # Clean after yourself + source_pg.stop() self.del_test_dir(module_name, fname) diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index 8fa442690..61273c3d8 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -1050,13 +1050,10 @@ def catchup_node( '--catchup-source-pgdata={0}'.format(source_pgdata), '--catchup-destination-pgdata={0}'.format(destination_node.data_dir) ] - if self.remote: cmd_list += 
['--remote-proto=ssh', '--remote-host=localhost'] self.run_pb(cmd_list + options) - - destination_node.append_conf(port=destination_node.port) return destination_node def show_pb( From fd7571b8360af4b514d4b7da11a2e55a87cb4fe7 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Thu, 27 May 2021 13:45:56 +0300 Subject: [PATCH 26/63] test_tablefile_truncation added --- tests/catchup.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/catchup.py b/tests/catchup.py index 08ac4bc22..e2a52feb0 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -312,3 +312,61 @@ def test_table_drop(self): # Clean after yourself source_pg.stop() self.del_test_dir(module_name, fname) + + # @unittest.skip("skip") + def test_tablefile_truncation(self): + """ + """ + fname = self.id().split('.')[3] + + source_pg = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'src'), + ptrack_enable = True, + initdb_params = ['--data-checksums']) + source_pg.slow_start() + + source_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") + source_pg.safe_psql( + "postgres", + "CREATE SEQUENCE t_seq; " + "CREATE TABLE t_heap AS SELECT i AS id, " + "md5(i::text) AS text, " + "md5(repeat(i::text, 10))::tsvector AS tsvector " + "FROM generate_series(0, 1024) i") + source_pg.safe_psql("postgres", "VACUUM t_heap") + + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = [ + '-d', 'postgres', + '-p', str(source_pg.port), + '--stream' + ] + ) + + dest_options = {} + dest_options['port'] = str(dest_pg.port) + self.set_auto_conf(dest_pg, dest_options) + dest_pg.slow_start() + dest_pg.stop() + + source_pg.safe_psql("postgres", "DELETE FROM t_heap WHERE ctid >= '(11,0)'") + source_pg.safe_psql("postgres", "VACUUM t_heap") + + # catchup + self.catchup_node( + backup_mode = 'PTRACK', + 
source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + + source_pgdata = self.pgdata_content(source_pg.data_dir) + dest_pgdata = self.pgdata_content(dest_pg.data_dir) + self.compare_pgdata(source_pgdata, dest_pgdata) + + # Clean after yourself + source_pg.stop() + self.del_test_dir(module_name, fname) From 72b5bba10df38de0f4feefd9563f14d65496604e Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Fri, 28 May 2021 15:00:20 +0300 Subject: [PATCH 27/63] Improve catchup help --- src/help.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/help.c b/src/help.c index 84e0b7d66..92236121f 100644 --- a/src/help.c +++ b/src/help.c @@ -2,7 +2,7 @@ * * help.c * - * Copyright (c) 2017-2019, Postgres Professional + * Copyright (c) 2017-2021, Postgres Professional * *------------------------------------------------------------------------- */ @@ -248,6 +248,19 @@ help_pg_probackup(void) printf(_(" [--ssh-options]\n")); printf(_(" [--help]\n")); + printf(_("\n%s catchup -b catchup-mode\n"), PROGRAM_NAME); + printf(_(" --catchup-source-pgdata=path_to_pgdata_on_remote_server\n")); + printf(_(" --catchup-destination-pgdata=path_to_local_dir\n")); + printf(_(" [--stream [-S slot-name]] [--temp-slot]\n")); + printf(_(" [-j num-threads]\n")); + printf(_(" [-T OLDDIR=NEWDIR]\n")); + printf(_(" [-d dbname] [-h host] [-p port] [-U username]\n")); + printf(_(" [-w --no-password] [-W --password]\n")); + printf(_(" [--remote-proto] [--remote-host]\n")); + printf(_(" [--remote-port] [--remote-path] [--remote-user]\n")); + printf(_(" [--ssh-options]\n")); + printf(_(" [--help]\n")); + if ((PROGRAM_URL || PROGRAM_EMAIL)) { printf("\n"); @@ -1015,24 +1028,29 @@ help_version(void) static void help_catchup(void) { - printf(_("\n%s catchup -b backup-mode\n"), PROGRAM_NAME); + printf(_("\n%s catchup -b catchup-mode\n"), PROGRAM_NAME); printf(_(" 
--catchup-source-pgdata=path_to_pgdata_on_remote_server\n")); printf(_(" --catchup-destination-pgdata=path_to_local_dir\n")); printf(_(" [--stream [-S slot-name]] [--temp-slot]\n")); printf(_(" [-j num-threads]\n")); + printf(_(" [-T OLDDIR=NEWDIR]\n")); printf(_(" [-d dbname] [-h host] [-p port] [-U username]\n")); printf(_(" [-w --no-password] [-W --password]\n")); printf(_(" [--remote-proto] [--remote-host]\n")); printf(_(" [--remote-port] [--remote-path] [--remote-user]\n")); - printf(_(" [--ssh-options]\n\n")); + printf(_(" [--ssh-options]\n")); + printf(_(" [--help]\n\n")); - printf(_(" -b, --backup-mode=backup-mode backup mode=FULL|DELTA|PTRACK\n")); + printf(_(" -b, --backup-mode=catchup-mode catchup mode=FULL|DELTA|PTRACK\n")); printf(_(" --stream stream the transaction log and include it in the backup\n")); printf(_(" -S, --slot=SLOTNAME replication slot to use\n")); printf(_(" --temp-slot use temporary replication slot\n")); printf(_(" -j, --threads=NUM number of parallel threads\n")); + printf(_(" -T, --tablespace-mapping=OLDDIR=NEWDIR\n")); + printf(_(" relocate the tablespace from directory OLDDIR to NEWDIR\n")); + printf(_("\n Connection options:\n")); printf(_(" -U, --pguser=USERNAME user name to connect as (default: current local user)\n")); printf(_(" -d, --pgdatabase=DBNAME database to connect (default: username)\n")); @@ -1052,4 +1070,3 @@ help_catchup(void) printf(_(" --ssh-options=ssh_options additional ssh options (default: none)\n")); printf(_(" (example: --ssh-options='-c cipher_spec -F configfile')\n\n")); } - From bee476c72ccffb2bb5cb408cf3937344a0fd5681 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 31 May 2021 13:30:46 +0300 Subject: [PATCH 28/63] pg_control now is synced last --- src/catchup.c | 71 ++++++++++++++++++++++++++++++---------------- src/data.c | 5 ++-- src/utils/parray.c | 7 +++++ src/utils/parray.h | 1 + 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 5bb63809a..6f7a9620c 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -152,6 +152,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * char dest_xlog_path[MAXPGPATH]; char label[1024]; RedoParams dest_redo = { 0, InvalidXLogRecPtr, 0 }; + pgFile *source_pg_control_file = NULL; /* arrays with meta info for multi threaded backup */ pthread_t *threads; @@ -183,8 +184,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * strncat(label, " with pg_probackup", lengthof(label) - strlen(" with pg_probackup")); - // TODO delete dest control file - /* Call pg_start_backup function in PostgreSQL connect */ pg_start_backup(label, smooth_checkpoint, ¤t, source_node_info, source_conn); elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn)); @@ -268,7 +267,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * */ parray_qsort(source_filelist, pgFileCompareRelPathWithExternal); - /* Extract information about files in backup_list parsing their names:*/ + /* Extract information about files in source_filelist parsing their names:*/ parse_filelist_filenames(source_filelist, source_pgdata); elog(LOG, "Start LSN (source): %X/%X, TLI: %X", @@ -372,6 +371,27 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } } + /* + * find pg_control file (in already sorted source_filelist) + * and exclude it from list for future special processing + */ + { + int control_file_elem_index; + pgFile search_key ; + MemSet(&search_key, 0, sizeof(pgFile)); + /* 
pgFileCompareRelPathWithExternal uses only .rel_path and .external_dir_num for comparision */ + search_key.rel_path = XLOG_CONTROL_FILE; + search_key.external_dir_num = 0; + control_file_elem_index = parray_bsearch_index(source_filelist, &search_key, pgFileCompareRelPathWithExternal); + if(control_file_elem_index < 0) + elog(ERROR, "\"%s\" not found in \"%s\"\n", XLOG_CONTROL_FILE, source_pgdata); + source_pg_control_file = parray_remove(source_filelist, control_file_elem_index); + } + + /* + * remove absent source files in dest (dropped tables, etc...) + * note: global/pg_control will also be deleted here + */ if (!dest_pgdata_is_empty && (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || current.backup_mode == BACKUP_MODE_DIFF_DELTA)) @@ -421,7 +441,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* init thread args with own file lists */ threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); - threads_args = (catchup_thread_runner_arg *) palloc(sizeof(catchup_thread_runner_arg)*num_threads); + threads_args = (catchup_thread_runner_arg *) palloc(sizeof(catchup_thread_runner_arg) * num_threads); for (i = 0; i < num_threads; i++) { @@ -434,7 +454,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * arg->dest_filelist = dest_filelist; arg->sync_lsn = dest_redo.lsn; arg->backup_mode = current.backup_mode; - arg->thread_num = i+1; + arg->thread_num = i + 1; /* By default there are some error */ arg->ret = 1; } @@ -459,6 +479,16 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * catchup_isok = false; } + /* at last copy control file */ + { + char from_fullpath[MAXPGPATH]; + char to_fullpath[MAXPGPATH]; + join_path_components(from_fullpath, source_pgdata, source_pg_control_file->rel_path); + join_path_components(to_fullpath, dest_pgdata, source_pg_control_file->rel_path); + copy_pgcontrol_file(from_fullpath, FIO_DB_HOST, + to_fullpath, FIO_BACKUP_HOST, 
source_pg_control_file); + } + time(&end_time); pretty_time_interval(difftime(end_time, start_time), pretty_time, lengthof(pretty_time)); @@ -541,25 +571,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * */ if (current.from_replica && !exclusive_backup) { - pgFile *pg_control = NULL; - - for (i = 0; i < parray_num(source_filelist); i++) - { - pgFile *tmp_file = (pgFile *) parray_get(source_filelist, i); - - if (tmp_file->external_dir_num == 0 && - (strcmp(tmp_file->rel_path, XLOG_CONTROL_FILE) == 0)) - { - pg_control = tmp_file; - break; - } - } - - if (!pg_control) - elog(ERROR, "Failed to find file \"%s\" in backup filelist.", - XLOG_CONTROL_FILE); - - set_min_recovery_point(pg_control, dest_pgdata, current.stop_lsn); + set_min_recovery_point(source_pg_control_file, dest_pgdata, current.stop_lsn); } /* close ssh session in main thread */ @@ -570,12 +582,13 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(WARNING, "Files are not synced to disk"); else { + char to_fullpath[MAXPGPATH]; + elog(INFO, "Syncing copied files to disk"); time(&start_time); for (i = 0; i < parray_num(source_filelist); i++) { - char to_fullpath[MAXPGPATH]; pgFile *file = (pgFile *) parray_get(source_filelist, i); /* TODO: sync directory ? 
*/ @@ -602,6 +615,13 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); } + /* + * sync pg_control file + */ + join_path_components(to_fullpath, dest_pgdata, source_pg_control_file->rel_path); + if (fio_sync(to_fullpath, FIO_BACKUP_HOST) != 0) + elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); + time(&end_time); pretty_time_interval(difftime(end_time, start_time), pretty_time, lengthof(pretty_time)); @@ -617,6 +637,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * parray_walk(source_filelist, pgFileFree); parray_free(source_filelist); + pgFileFree(source_pg_control_file); // где закрывается conn? } diff --git a/src/data.c b/src/data.c index f2b8867a9..1d3ee9ebd 100644 --- a/src/data.c +++ b/src/data.c @@ -682,8 +682,7 @@ catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpa * This way we can correctly handle null-sized files which are * not tracked by pagemap and thus always marked as unchanged. */ - if ((backup_mode == BACKUP_MODE_DIFF_PAGE || - backup_mode == BACKUP_MODE_DIFF_PTRACK) && + if (backup_mode == BACKUP_MODE_DIFF_PTRACK && file->pagemap.bitmapsize == PageBitmapIsEmpty && file->exists_in_prev && !file->pagemap_isabsent) { @@ -711,7 +710,7 @@ catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpa * Such files should be fully copied. 
*/ - if (file->pagemap.bitmapsize == PageBitmapIsEmpty || + if (file->pagemap.bitmapsize == PageBitmapIsEmpty || file->pagemap_isabsent || !file->exists_in_prev || !file->pagemap.bitmap) use_pagemap = false; diff --git a/src/utils/parray.c b/src/utils/parray.c index 95b83365d..792e26907 100644 --- a/src/utils/parray.c +++ b/src/utils/parray.c @@ -198,6 +198,13 @@ parray_bsearch(parray *array, const void *key, int(*compare)(const void *, const return bsearch(&key, array->data, array->used, sizeof(void *), compare); } +int +parray_bsearch_index(parray *array, const void *key, int(*compare)(const void *, const void *)) +{ + void **elem = parray_bsearch(array, key, compare); + return elem != NULL ? elem - array->data : -1; +} + /* checks that parray contains element */ bool parray_contains(parray *array, void *elem) { diff --git a/src/utils/parray.h b/src/utils/parray.h index 85d7383f3..e92ad728c 100644 --- a/src/utils/parray.h +++ b/src/utils/parray.h @@ -29,6 +29,7 @@ extern bool parray_rm(parray *array, const void *key, int(*compare)(const void * extern size_t parray_num(const parray *array); extern void parray_qsort(parray *array, int(*compare)(const void *, const void *)); extern void *parray_bsearch(parray *array, const void *key, int(*compare)(const void *, const void *)); +extern int parray_bsearch_index(parray *array, const void *key, int(*compare)(const void *, const void *)); extern void parray_walk(parray *array, void (*action)(void *)); extern bool parray_contains(parray *array, void *elem); From 74cd21ac331129e1996c39ee751cb0825851be68 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 31 May 2021 13:48:40 +0300 Subject: [PATCH 29/63] apply lubennikovaav review.patch --- src/catchup.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ src/dir.c | 1 + 2 files changed, 50 insertions(+) diff --git a/src/catchup.c b/src/catchup.c index 6f7a9620c..643c34dae 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -52,11 +52,16 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads) do_catchup_instance(source_pgdata, dest_pgdata, source_conn, &source_node_info, no_sync, backup_logs, dest_pgdata_is_empty); + //REVIEW: Are we going to do that before release? /* TODO: show the amount of transfered data in bytes and calculate incremental ratio */ return 0; } +//REVIEW Please add a comment to this function. +//Besides, the name of this function looks strange to me. +//Maybe catchup_init_state() or catchup_setup() will do better? +//I'd also suggest to wrap all these fields into some CatchupState, but it isn't urgent. static PGconn * catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata) { @@ -70,12 +75,14 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co current.start_time = time(NULL); StrNCpy(current.program_version, PROGRAM_VERSION, sizeof(current.program_version)); + //REVIEW I guess these are some copy-paste leftovers. Let's clean them. //current.compress_alg = instance_config.compress_alg; //current.compress_level = instance_config.compress_level; /* Do some compatibility checks and fill basic info about PG instance */ source_conn = pgdata_basic_setup(instance_config.conn_opt, source_node_info); + //REVIEW Please adjust the comment. Do we need this code for catchup at all? 
/* below perform checks specific for backup command */ #if PG_VERSION_NUM >= 110000 if (!RetrieveWalSegSize(source_conn)) @@ -106,10 +113,12 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co return source_conn; } +//REVIEW Please add a comment to this function. static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, const char *source_pgdata, const char *dest_pgdata, bool dest_pgdata_is_empty) { + //REVIEW Let's fix it before release. // TODO: add sanity check that source PGDATA is not empty /* Check that connected PG instance and source PGDATA are the same */ @@ -135,6 +144,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, if (current.from_replica && exclusive_backup) elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); + //REVIEW FIXME Let's fix it before release. This one seems like a potential bug. // TODO check if it is local catchup and source contain tablespaces } @@ -154,6 +164,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * RedoParams dest_redo = { 0, InvalidXLogRecPtr, 0 }; pgFile *source_pg_control_file = NULL; + //REVIEW please adjust this comment. /* arrays with meta info for multi threaded backup */ pthread_t *threads; catchup_thread_runner_arg *threads_args; @@ -161,8 +172,10 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * parray *source_filelist = NULL; parray *dest_filelist = NULL; + //REVIEW We don't handle external_dirs in catchup, do we? Let's clean this up. parray *external_dirs = NULL; + //REVIEW FIXME Let's fix it before release. It can cause some obscure bugs. 
/* TODO: in case of timeline mistmatch, check that source PG timeline descending from dest PG timeline */ parray *tli_list = NULL; @@ -172,6 +185,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * char pretty_bytes[20]; PGStopBackupResult stop_backup_result; + //REVIEW Is it relevant to catchup? I suppose it isn't, since catchup is a new code. + //If we do need it, please write a comment explaining that. /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ int timeout = (instance_config.archive_timeout > 0) ? instance_config.archive_timeout : ARCHIVE_TIMEOUT_DEFAULT; @@ -184,10 +199,14 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * strncat(label, " with pg_probackup", lengthof(label) - strlen(" with pg_probackup")); + //REVIEW FIXME Let' do that. + /* Call pg_start_backup function in PostgreSQL connect */ pg_start_backup(label, smooth_checkpoint, ¤t, source_node_info, source_conn); elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn)); + //REVIEW I wonder, if we can move this piece above and call before pg_start backup()? + //It seems to be a part of setup phase. if (!dest_pgdata_is_empty && (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || current.backup_mode == BACKUP_MODE_DIFF_DELTA)) @@ -201,6 +220,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(INFO, "syncLSN = %X/%X", (uint32) (dest_redo.lsn >> 32), (uint32) dest_redo.lsn); } + //REVIEW I wonder, if we can move this piece above and call before pg_start backup()? + //It seems to be a part of setup phase. /* * TODO: move to separate function to use in both backup.c and catchup.c */ @@ -234,6 +255,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * current.start_lsn, current.tli); } + //REVIEW please adjust the comment. 
/* initialize backup list */ source_filelist = parray_new(); @@ -244,12 +266,15 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * else dir_list_file(source_filelist, source_pgdata, true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); + + //REVIEW FIXME. Let's fix that before release. // TODO filter pg_xlog/wal? // TODO what if wal is not a dir (symlink to a dir)? /* close ssh session in main thread */ fio_disconnect(); + //REVIEW Do we want to do similar calculation for dest? current.pgdata_bytes += calculate_datasize_of_filelist(source_filelist); pretty_size(current.pgdata_bytes, pretty_bytes, lengthof(pretty_bytes)); elog(INFO, "Source PGDATA size: %s", pretty_bytes); @@ -267,12 +292,14 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * */ parray_qsort(source_filelist, pgFileCompareRelPathWithExternal); + //REVIEW Please adjust the comment. /* Extract information about files in source_filelist parsing their names:*/ parse_filelist_filenames(source_filelist, source_pgdata); elog(LOG, "Start LSN (source): %X/%X, TLI: %X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn), current.tli); + //REVIEW FIXME Huh? Don't we check TLI at all? /* TODO проверить, нужна ли проверка TLI */ if (current.backup_mode != BACKUP_MODE_FULL) elog(LOG, "LSN in destination: %X/%X, TLI: %X", @@ -339,11 +366,14 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * char to_path[MAXPGPATH]; // perform additional check that this is actually synlink? + //REVIEW Why is this code block separated? { /* get full symlink path and map this path to new location */ char source_full_path[MAXPGPATH]; char symlink_content[MAXPGPATH]; join_path_components(source_full_path, source_pgdata, file->rel_path); fio_readlink(source_full_path, symlink_content, sizeof(symlink_content), FIO_DB_HOST); + //REVIEW What if we won't find mapping for this tablespace? + //I'd expect a failure. 
Otherwise, we may spoil source database data. linked_path = leaked_abstraction_get_tablespace_mapping(symlink_content); // TODO: check that linked_path != symlink_content in case of local catchup? elog(WARNING, "Map tablespace full_path: \"%s\" old_symlink_content: \"%s\" old_symlink_content: \"%s\"\n", @@ -361,6 +391,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(VERBOSE, "Create directory \"%s\" and symbolic link \"%s\"", linked_path, to_path); + //REVIEW Handle return value here. + //We should not proceed if failed to create dir. /* create tablespace directory */ fio_mkdir(linked_path, DIR_PERMISSION, FIO_BACKUP_HOST); @@ -403,6 +435,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * bool redundant = true; pgFile *file = (pgFile *) parray_get(dest_filelist, i); + //REVIEW Can we maybe optimize it and use some merge-like algorithm + //instead of bsearch for each file? Of course it isn't an urgent fix. if (parray_bsearch(source_filelist, file, pgFileCompareRelPathWithExternal)) redundant = false; @@ -411,6 +445,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * pg_strcasecmp(file->name, RELMAPPER_FILENAME) == 0) redundant = true; + //REVIEW This check seems unneded. Anyway we delete only redundant stuff below. /* do not delete the useful internal directories */ if (S_ISDIR(file->mode) && !redundant) continue; @@ -433,12 +468,16 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } } + //REVIEW Hmm. Why do we need this at all? + //I'd expect that we init pgfile with unset lock... + //Not related to this patch, though. /* clear file locks */ pfilearray_clear_locks(source_filelist); /* Sort by size for load balancing */ parray_qsort(source_filelist, pgFileCompareSize); + //REVIEW. This comment looks a bit misleading, since all theads share same filelist. 
/* init thread args with own file lists */ threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); threads_args = (catchup_thread_runner_arg *) palloc(sizeof(catchup_thread_runner_arg) * num_threads); @@ -499,9 +538,12 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(ERROR, "Data files transferring failed, time elapsed: %s", pretty_time); + //REVIEW The comment looks unrelated to the function. Do I miss something? /* Notify end of backup */ pg_silent_client_messages(source_conn); + //REVIEW. Do we want to support pg 9.5? I suppose we never test it... + //Maybe check it and error out early? /* Create restore point * Only if backup is from master. * For PG 9.5 create restore point only if pguser is superuser. @@ -545,14 +587,17 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * stop_backup_result.tablespace_map_content_len = 0; } + //REVIEW We don't pass a filelist. Please adjust the comment. /* This function will also add list of xlog files * to the passed filelist */ if(wait_WAL_streaming_end(NULL)) elog(ERROR, "WAL streaming failed"); + //REVIEW Please add a comment about these lsns. It is a crutial part of the algorithm. current.recovery_xid = stop_backup_result.snapshot_xid; elog(LOG, "Getting the Recovery Time from WAL"); + /* iterate over WAL from stop_backup lsn to start_backup lsn */ if (!read_recovery_info(dest_xlog_path, current.tli, instance_config.xlog_seg_size, @@ -566,6 +611,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Cleanup */ pg_free(query_text); + //REVIEW Please adjust the comment. /* In case of backup from replica >= 9.6 we must fix minRecPoint, * First we must find pg_control in source_filelist. 
*/ @@ -601,6 +647,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* construct fullpath */ if (file->external_dir_num == 0) join_path_components(to_fullpath, dest_pgdata, file->rel_path); + //REVIEW Let's clean this. /* TODO разобраться с external */ /*else { @@ -638,6 +685,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * parray_walk(source_filelist, pgFileFree); parray_free(source_filelist); pgFileFree(source_pg_control_file); + //REVIEW Huh? // где закрывается conn? } @@ -682,6 +730,7 @@ catchup_thread_runner(void *arg) join_path_components(from_fullpath, arguments->from_root, file->rel_path); join_path_components(to_fullpath, arguments->to_root, file->rel_path); } + //REVIEW Let's clean this. /*else { char external_dst[MAXPGPATH]; diff --git a/src/dir.c b/src/dir.c index 4514be9a3..aa7fc6b37 100644 --- a/src/dir.c +++ b/src/dir.c @@ -909,6 +909,7 @@ get_tablespace_mapping(const char *dir) return dir; } +//REVIEW What exactly wrong with this abstraction? I don't get it... /* * TODO протёкшая абстрация, надо на этапе ревью решить что с ней делать, * потому как непонятно, почему мы в backup.c напрямую работаем с созданием From 271cf160d75798bd25efa518e79f332055afc4a2 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 31 May 2021 13:51:48 +0300 Subject: [PATCH 30/63] indent correction as noted by Roman Zharkov --- src/help.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/help.c b/src/help.c index 92236121f..0d18d359a 100644 --- a/src/help.c +++ b/src/help.c @@ -248,7 +248,7 @@ help_pg_probackup(void) printf(_(" [--ssh-options]\n")); printf(_(" [--help]\n")); - printf(_("\n%s catchup -b catchup-mode\n"), PROGRAM_NAME); + printf(_("\n %s catchup -b catchup-mode\n"), PROGRAM_NAME); printf(_(" --catchup-source-pgdata=path_to_pgdata_on_remote_server\n")); printf(_(" --catchup-destination-pgdata=path_to_local_dir\n")); printf(_(" [--stream [-S slot-name]] [--temp-slot]\n")); From 8e03b8d7fb583b1883f745975868bcff36d2fa19 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 31 May 2021 15:19:26 +0300 Subject: [PATCH 31/63] Review answer #1 --- src/catchup.c | 29 +++++++++++++---------------- src/utils/file.c | 2 +- tests/catchup.py | 13 ++++++++++++- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 643c34dae..9488ee24f 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -72,6 +72,9 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co /* Get WAL segments size and system ID of source PG instance */ instance_config.xlog_seg_size = get_xlog_seg_size(source_pgdata); instance_config.system_identifier = get_system_identifier(source_pgdata); +#if PG_VERSION_NUM < 90600 + instance_config.pgdata = source_pgdata; +#endif current.start_time = time(NULL); StrNCpy(current.program_version, PROGRAM_VERSION, sizeof(current.program_version)); @@ -113,7 +116,10 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co return source_conn; } -//REVIEW Please add a comment to this function. 
+/* + * Check that catchup can be performed on source and dest + * this function is for checks, that can be performed without modification of data on disk + */ static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, const char *source_pgdata, const char *dest_pgdata, bool dest_pgdata_is_empty) @@ -164,16 +170,13 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * RedoParams dest_redo = { 0, InvalidXLogRecPtr, 0 }; pgFile *source_pg_control_file = NULL; - //REVIEW please adjust this comment. - /* arrays with meta info for multi threaded backup */ + /* arrays with meta info for multi threaded catchup */ pthread_t *threads; catchup_thread_runner_arg *threads_args; bool catchup_isok = true; parray *source_filelist = NULL; parray *dest_filelist = NULL; - //REVIEW We don't handle external_dirs in catchup, do we? Let's clean this up. - parray *external_dirs = NULL; //REVIEW FIXME Let's fix it before release. It can cause some obscure bugs. /* TODO: in case of timeline mistmatch, check that source PG timeline descending from dest PG timeline */ @@ -199,8 +202,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * strncat(label, " with pg_probackup", lengthof(label) - strlen(" with pg_probackup")); - //REVIEW FIXME Let' do that. 
- /* Call pg_start_backup function in PostgreSQL connect */ pg_start_backup(label, smooth_checkpoint, ¤t, source_node_info, source_conn); elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn)); @@ -231,8 +232,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * // new ptrack is more robust and checks Start LSN if (ptrack_lsn > dest_redo.lsn || ptrack_lsn == InvalidXLogRecPtr) - elog(ERROR, "LSN from ptrack_control %X/%X is greater than checkpoint LSN %X/%X.\n" - "Create new full backup before an incremental one.", + elog(ERROR, "LSN from ptrack_control in source %X/%X is greater than checkpoint LSN in destination %X/%X.\n" + "You can perform only FULL catchup.", (uint32) (ptrack_lsn >> 32), (uint32) (ptrack_lsn), (uint32) (dest_redo.lsn >> 32), (uint32) (dest_redo.lsn)); @@ -255,8 +256,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * current.start_lsn, current.tli); } - //REVIEW please adjust the comment. - /* initialize backup list */ source_filelist = parray_new(); /* list files with the logical path. omit $PGDATA */ @@ -471,14 +470,14 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * //REVIEW Hmm. Why do we need this at all? //I'd expect that we init pgfile with unset lock... //Not related to this patch, though. + //REVIEW_ANSWER initialization in the pgFileInit function was proposed but was not accepted (see 2c8b7e9) /* clear file locks */ pfilearray_clear_locks(source_filelist); /* Sort by size for load balancing */ parray_qsort(source_filelist, pgFileCompareSize); - //REVIEW. This comment looks a bit misleading, since all theads share same filelist. 
- /* init thread args with own file lists */ + /* init thread args */ threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); threads_args = (catchup_thread_runner_arg *) palloc(sizeof(catchup_thread_runner_arg) * num_threads); @@ -539,6 +538,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * pretty_time); //REVIEW The comment looks unrelated to the function. Do I miss something? + //REVIEW_ANSWER because it is a part of pg_stop_backup() calling /* Notify end of backup */ pg_silent_client_messages(source_conn); @@ -681,12 +681,9 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * parray_walk(dest_filelist, pgFileFree); parray_free(dest_filelist); } - parray_walk(source_filelist, pgFileFree); parray_free(source_filelist); pgFileFree(source_pg_control_file); - //REVIEW Huh? - // где закрывается conn? } /* diff --git a/src/utils/file.c b/src/utils/file.c index e2a6eaf0b..bdcaafa5d 100644 --- a/src/utils/file.c +++ b/src/utils/file.c @@ -2001,7 +2001,7 @@ fio_copy_pages(const char *to_fullpath, const char *from_fullpath, pgFile *file, COMP_FILE_CRC32(true, file->crc, buf, hdr.size); - elog(INFO, "Copy block %u with size %u of %s", blknum, hdr.size - sizeof(BackupPageHeader), to_fullpath); + elog(INFO, "Copy block %u with size %lu of %s", blknum, hdr.size - sizeof(BackupPageHeader), to_fullpath); if (fio_fseek(out, blknum * BLCKSZ) < 0) { elog(ERROR, "Cannot seek block %u of \"%s\": %s", diff --git a/tests/catchup.py b/tests/catchup.py index e2a52feb0..696e28ace 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -14,7 +14,9 @@ def test_multithread_local_transfer(self): """ fname = self.id().split('.')[3] - source_pg = self.make_simple_node(base_dir = os.path.join(module_name, fname, 'src')) + source_pg = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'src'), + set_replication=True) source_pg.slow_start() source_pg.safe_psql( "postgres", @@ -140,6 +142,9 @@ def 
test_remote_ptrack_catchup(self): generate some load on master, insert some test data on master, catchup copy, start and select test data """ + if not self.ptrack: + return unittest.skip('Skipped because ptrack support is disabled') + fname = self.id().split('.')[3] # prepare master @@ -263,6 +268,9 @@ def test_remote_delta_catchup(self): def test_table_drop(self): """ """ + if not self.ptrack: + return unittest.skip('Skipped because ptrack support is disabled') + fname = self.id().split('.')[3] source_pg = self.make_simple_node( @@ -317,6 +325,9 @@ def test_table_drop(self): def test_tablefile_truncation(self): """ """ + if not self.ptrack: + return unittest.skip('Skipped because ptrack support is disabled') + fname = self.id().split('.')[3] source_pg = self.make_simple_node( From b294557c0e2a82dd01bf9be84455c2713bebb909 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Tue, 1 Jun 2021 12:22:37 +0300 Subject: [PATCH 32/63] typo --- src/restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/restore.c b/src/restore.c index f90befd76..9a31ebbb7 100644 --- a/src/restore.c +++ b/src/restore.c @@ -2185,7 +2185,7 @@ check_incremental_compatibility(const char *pgdata, uint64 system_identifier, * data files content, because based on pg_control information we will * choose a backup suitable for lsn based incremental restore. */ - elog(INFO, "Trying to read pg_control file in destination direstory"); + elog(INFO, "Trying to read pg_control file in destination directory"); system_id_pgdata = get_system_identifier(pgdata); From 2e3adf1c61d081bf5d4038239a61b61ba01f3aae Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Tue, 1 Jun 2021 13:45:42 +0300 Subject: [PATCH 33/63] check emptiness of dest_pgdata --- src/catchup.c | 69 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 9488ee24f..e7ff5b06a 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -28,10 +28,9 @@ */ static PGconn *catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata); static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, const char *source_pgdata, - const char *dest_pgdata, bool dest_pgdata_is_empty); + const char *dest_pgdata); static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, - PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs, - bool dest_pgdata_is_empty); + PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs); static void *catchup_thread_runner(void *arg); /* @@ -44,13 +43,12 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads) PGNodeInfo source_node_info; bool no_sync = false; bool backup_logs = false; - bool dest_pgdata_is_empty = dir_is_empty(dest_pgdata, FIO_LOCAL_HOST); source_conn = catchup_collect_info(&source_node_info, source_pgdata, dest_pgdata); - catchup_preflight_checks(&source_node_info, source_conn, source_pgdata, dest_pgdata, dest_pgdata_is_empty); + catchup_preflight_checks(&source_node_info, source_conn, source_pgdata, dest_pgdata); do_catchup_instance(source_pgdata, dest_pgdata, source_conn, &source_node_info, - no_sync, backup_logs, dest_pgdata_is_empty); + no_sync, backup_logs); //REVIEW: Are we going to do that before release? 
/* TODO: show the amount of transfered data in bytes and calculate incremental ratio */ @@ -66,6 +64,7 @@ static PGconn * catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata) { PGconn *source_conn; + /* Initialize PGInfonode */ pgNodeInit(source_node_info); @@ -85,8 +84,6 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co /* Do some compatibility checks and fill basic info about PG instance */ source_conn = pgdata_basic_setup(instance_config.conn_opt, source_node_info); - //REVIEW Please adjust the comment. Do we need this code for catchup at all? - /* below perform checks specific for backup command */ #if PG_VERSION_NUM >= 110000 if (!RetrieveWalSegSize(source_conn)) elog(ERROR, "Failed to retrieve wal_segment_size"); @@ -122,10 +119,42 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co */ static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, - const char *source_pgdata, const char *dest_pgdata, bool dest_pgdata_is_empty) + const char *source_pgdata, const char *dest_pgdata) { - //REVIEW Let's fix it before release. - // TODO: add sanity check that source PGDATA is not empty + /* TODO + * gsmol - fallback to FULL mode if dest PGDATA is empty + * kulaginm -- I think this is a harmful feature. If user requested an incremental catchup, then + * he expects that this will be done quickly and efficiently. If, for example, he made a mistake + * with dest_dir, then he will receive a second full copy instead of an error message, and I think + * that in some cases he would prefer the error. + * I propose in future versions to offer a backup_mode auto, in which we will look to the dest_dir + * and decide which of the modes will be the most effective. 
+ * I.e.: + * if(requested_backup_mode == BACKUP_MODE_DIFF_AUTO) + * { + * if(dest_pgdata_is_empty) + * backup_mode = BACKUP_MODE_FULL; + * else + * if(ptrack supported and applicable) + * backup_mode = BACKUP_MODE_DIFF_PTRACK; + * else + * backup_mode = BACKUP_MODE_DIFF_DELTA; + * } + */ + + if (dir_is_empty(dest_pgdata, FIO_LOCAL_HOST)) + { + if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || + current.backup_mode == BACKUP_MODE_DIFF_DELTA) + elog(ERROR, "\"%s\" is empty but incremental catchup mode requested.", + dest_pgdata); + } + else /* dest dir not empty */ + { + if (current.backup_mode == BACKUP_MODE_FULL) + elog(ERROR, "Can't perform full catchup into not empty directory \"%s\".", + dest_pgdata); + } /* Check that connected PG instance and source PGDATA are the same */ check_system_identifiers(source_conn, source_pgdata); @@ -141,7 +170,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, elog(ERROR, "Ptrack is disabled"); } - if (!dest_pgdata_is_empty && + if (current.backup_mode != BACKUP_MODE_FULL && check_incremental_compatibility(dest_pgdata, instance_config.system_identifier, INCR_CHECKSUM) != DEST_OK) @@ -157,12 +186,10 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, /* * TODO: * - add description - * - fallback to FULL mode if dest PGDATA is empty */ static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, - PGNodeInfo *source_node_info, bool no_sync, bool backup_logs, - bool dest_pgdata_is_empty) + PGNodeInfo *source_node_info, bool no_sync, bool backup_logs) { int i; char dest_xlog_path[MAXPGPATH]; @@ -208,9 +235,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * //REVIEW I wonder, if we can move this piece above and call before pg_start backup()? //It seems to be a part of setup phase. 
- if (!dest_pgdata_is_empty && - (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || - current.backup_mode == BACKUP_MODE_DIFF_DELTA)) + if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || + current.backup_mode == BACKUP_MODE_DIFF_DELTA) { dest_filelist = parray_new(); dir_list_file(dest_filelist, dest_pgdata, @@ -423,9 +449,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * * remove absent source files in dest (dropped tables, etc...) * note: global/pg_control will also be deleted here */ - if (!dest_pgdata_is_empty && - (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || - current.backup_mode == BACKUP_MODE_DIFF_DELTA)) + if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || + current.backup_mode == BACKUP_MODE_DIFF_DELTA) { elog(INFO, "Removing redundant files in destination directory"); parray_qsort(dest_filelist, pgFileCompareRelPathWithExternalDesc); @@ -676,7 +701,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } /* Cleanup */ - if (!dest_pgdata_is_empty && dest_filelist) + if (dest_filelist) { parray_walk(dest_filelist, pgFileFree); parray_free(dest_filelist); From 861ddd339b31c2951dde34572e2a5cde4cb6463c Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Tue, 1 Jun 2021 15:34:41 +0300 Subject: [PATCH 34/63] check that user uses tablespace_mapping --- src/catchup.c | 53 ++++++++++++++++++++++++++++++++++++++++++------ tests/catchup.py | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index e7ff5b06a..b44229d64 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -29,6 +29,7 @@ static PGconn *catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata); static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, const char *source_pgdata, const char *dest_pgdata); +static void check_tablespaces_existance_in_tbsmapping(PGconn *conn); static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs); static void *catchup_thread_runner(void *arg); @@ -126,7 +127,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, * kulaginm -- I think this is a harmful feature. If user requested an incremental catchup, then * he expects that this will be done quickly and efficiently. If, for example, he made a mistake * with dest_dir, then he will receive a second full copy instead of an error message, and I think - * that in some cases he would prefer the error. + * that in some cases he would prefer the error. * I propose in future versions to offer a backup_mode auto, in which we will look to the dest_dir * and decide which of the modes will be the most effective. 
* I.e.: @@ -134,10 +135,10 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, * { * if(dest_pgdata_is_empty) * backup_mode = BACKUP_MODE_FULL; - * else - * if(ptrack supported and applicable) + * else + * if(ptrack supported and applicable) * backup_mode = BACKUP_MODE_DIFF_PTRACK; - * else + * else * backup_mode = BACKUP_MODE_DIFF_DELTA; * } */ @@ -179,8 +180,48 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, if (current.from_replica && exclusive_backup) elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); - //REVIEW FIXME Let's fix it before release. This one seems like a potential bug. - // TODO check if it is local catchup and source contain tablespaces + if (!fio_is_remote(FIO_DB_HOST)) + check_tablespaces_existance_in_tbsmapping(source_conn); +} + +/* + * Check that all tablespaces exists in tablespace mapping (--tablespace-mapping option) + * Emit fatal error if that tablespace found + */ +static void +check_tablespaces_existance_in_tbsmapping(PGconn *conn) +{ + PGresult *res; + int i; + char *tablespace_path = NULL; + const char *linked_path = NULL; + char *query = "SELECT pg_catalog.pg_tablespace_location(oid) " + "FROM pg_catalog.pg_tablespace " + "WHERE pg_catalog.pg_tablespace_location(oid) <> '';"; + + res = pgut_execute(conn, query, 0, NULL); + + if (!res) + elog(ERROR, "Failed to get list of tablespaces"); + + for (i = 0; i < res->ntups; i++) + { + tablespace_path = PQgetvalue(res, i, 0); + Assert (strlen(tablespace_path) > 0); + + canonicalize_path(tablespace_path); + linked_path = leaked_abstraction_get_tablespace_mapping(tablespace_path); + + if (strcmp(tablespace_path, linked_path) == 0) + /* same result -> not found in mapping */ + elog(ERROR, "Local catchup executed, but source database contains " + "tablespace (\"%s\"), that are not listed in the map", tablespace_path); + + if (!is_absolute_path(linked_path)) + elog(ERROR, "Tablespace directory path must be an absolute 
path: %s\n", + linked_path); + } + PQclear(res); } /* diff --git a/tests/catchup.py b/tests/catchup.py index 696e28ace..04f6f8997 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -381,3 +381,45 @@ def test_tablefile_truncation(self): # Clean after yourself source_pg.stop() self.del_test_dir(module_name, fname) + + # @unittest.skip("skip") + def test_local_tablespace_without_mapping(self): + fname = self.id().split('.')[3] + + source_pg = self.make_simple_node( + base_dir = os.path.join(module_name, fname, 'src'), + initdb_params = ['--data-checksums']) + source_pg.slow_start() + + tblspace_path = self.get_tblspace_path(source_pg, 'tblspace') + self.create_tblspace_in_node( + source_pg, 'tblspace', + tblspc_path = tblspace_path) + + source_pg.safe_psql( + "postgres", + "CREATE TABLE ultimate_question TABLESPACE tblspace AS SELECT 42 AS answer") + + dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + try: + dest_pg = self.catchup_node( + backup_mode = 'FULL', + source_pgdata = source_pg.data_dir, + destination_node = dest_pg, + options = [ + '-d', 'postgres', + '-p', str(source_pg.port), + '--stream', + ] + ) + self.assertEqual(1, 0, "Expecting Error because '-T' parameter is not specified.\n Output: {0} \n CMD: {1}".format( + repr(self.output), self.cmd)) + except ProbackupException as e: + self.assertIn( + 'ERROR: Local catchup executed, but source database contains tablespace', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + source_pg.stop() + # Clean after yourself + self.del_test_dir(module_name, fname) From e865c834393a8ab9f5baab8b0de75002af2233f0 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Wed, 2 Jun 2021 15:02:17 +0300 Subject: [PATCH 35/63] fix destination file existence --- src/catchup.c | 11 ++++++----- src/data.c | 7 +++++-- src/dir.c | 7 +++++++ src/pg_probackup.h | 1 + 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index b44229d64..c5a217feb 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -389,7 +389,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } /* - * Make directories before catchup and setup threads at the same time + * Make directories before catchup */ /* * We iterate over source_filelist and for every directory with parent 'pg_tblspc' @@ -541,7 +541,11 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * pfilearray_clear_locks(source_filelist); /* Sort by size for load balancing */ - parray_qsort(source_filelist, pgFileCompareSize); + parray_qsort(source_filelist, pgFileCompareSizeDesc); + + /* Sort the array for binary search */ + if (dest_filelist) + parray_qsort(dest_filelist, pgFileCompareRelPathWithExternal); /* init thread args */ threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); @@ -653,9 +657,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * stop_backup_result.tablespace_map_content_len = 0; } - //REVIEW We don't pass a filelist. Please adjust the comment. 
- /* This function will also add list of xlog files - * to the passed filelist */ if(wait_WAL_streaming_end(NULL)) elog(ERROR, "WAL streaming failed"); diff --git a/src/data.c b/src/data.c index 1d3ee9ebd..c3ba5233b 100644 --- a/src/data.c +++ b/src/data.c @@ -717,12 +717,15 @@ catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpa else use_pagemap = true; + if (use_pagemap) + elog(VERBOSE, "Using pagemap for file \"%s\"", file->rel_path); + /* Remote mode */ if (fio_is_remote(FIO_DB_HOST)) { rc = fio_copy_pages(to_fullpath, from_fullpath, file, /* send prev backup START_LSN */ - backup_mode == BACKUP_MODE_DIFF_DELTA && + (backup_mode == BACKUP_MODE_DIFF_DELTA || backup_mode == BACKUP_MODE_DIFF_PTRACK) && file->exists_in_prev ? prev_backup_start_lsn : InvalidXLogRecPtr, calg, clevel, checksum_version, /* send pagemap if any */ @@ -735,7 +738,7 @@ catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpa /* TODO: stop handling errors internally */ rc = copy_pages(to_fullpath, from_fullpath, file, /* send prev backup START_LSN */ - backup_mode == BACKUP_MODE_DIFF_DELTA && + (backup_mode == BACKUP_MODE_DIFF_DELTA || backup_mode == BACKUP_MODE_DIFF_PTRACK) && file->exists_in_prev ? 
prev_backup_start_lsn : InvalidXLogRecPtr, checksum_version, use_pagemap, backup_mode, ptrack_version_num, ptrack_schema); diff --git a/src/dir.c b/src/dir.c index aa7fc6b37..1476f85f1 100644 --- a/src/dir.c +++ b/src/dir.c @@ -485,6 +485,13 @@ pgFileCompareSize(const void *f1, const void *f2) return 0; } +/* Compare two pgFile with their size in descending order */ +int +pgFileCompareSizeDesc(const void *f1, const void *f2) +{ + return -1 * pgFileCompareSize(f1, f2); +} + static int pgCompareString(const void *str1, const void *str2) { diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 8701a09bc..af5b60f2a 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1086,6 +1086,7 @@ extern int pgFileCompareRelPathWithExternal(const void *f1, const void *f2); extern int pgFileCompareRelPathWithExternalDesc(const void *f1, const void *f2); extern int pgFileCompareLinked(const void *f1, const void *f2); extern int pgFileCompareSize(const void *f1, const void *f2); +extern int pgFileCompareSizeDesc(const void *f1, const void *f2); extern int pgCompareOid(const void *f1, const void *f2); extern void pfilearray_clear_locks(parray *file_list); From 2598c99794094012e02c415251913be073b8245f Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Thu, 3 Jun 2021 00:34:35 +0300 Subject: [PATCH 36/63] Print octal permissions in tests compare_pgdata() --- tests/helpers/ptrack_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index 61273c3d8..ae42bd879 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -1756,10 +1756,10 @@ def compare_pgdata(self, original_pgdata, restored_pgdata): ): fail = True error_message += '\nFile permissions mismatch:\n' - error_message += ' File_old: {0} Permissions: {1}\n'.format( + error_message += ' File_old: {0} Permissions: {1:o}\n'.format( os.path.join(original_pgdata['pgdata'], file), original_pgdata['files'][file]['mode']) - error_message += ' File_new: {0} Permissions: {1}\n'.format( + error_message += ' File_new: {0} Permissions: {1:o}\n'.format( os.path.join(restored_pgdata['pgdata'], file), restored_pgdata['files'][file]['mode']) From 746a1a5da0811dbd465aa14ce2cd6cffd1999455 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Thu, 3 Jun 2021 01:23:17 +0300 Subject: [PATCH 37/63] fix local delta backup --- src/catchup.c | 2 +- src/data.c | 41 +++++++++++++++++++++++++++++++++-------- tests/catchup.py | 15 +++++++++++---- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index c5a217feb..4bcdb1881 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -853,7 +853,7 @@ catchup_thread_runner(void *arg) if (file->write_size == BYTES_INVALID) { - elog(VERBOSE, "Skipping the unchanged file: \"%s\"", from_fullpath); + elog(VERBOSE, "Skipping the unchanged file: \"%s\", read %li bytes", from_fullpath, file->read_size); continue; } diff --git a/src/data.c b/src/data.c index c3ba5233b..88c99f752 100644 --- a/src/data.c +++ b/src/data.c @@ -268,7 +268,7 @@ get_checksum_errormsg(Page page, char **errormsg, BlockNumber absolute_blkno) * PageIsOk(0) if page was successfully retrieved * PageIsTruncated(-1) if the page was truncated * SkipCurrentPage(-2) if we need to skip this page, - * only used for DELTA backup + * only used for DELTA and PTRACK backup * PageIsCorrupted(-3) if the page checksum mismatch * or header corruption, * only used for checkdb @@ -403,7 +403,12 @@ prepare_page(pgFile *file, XLogRecPtr prev_backup_start_lsn, page_st->lsn > 0 && page_st->lsn < prev_backup_start_lsn) { - elog(VERBOSE, "Skipping blknum %u in file: \"%s\"", blknum, from_fullpath); + elog(VERBOSE, "Skipping blknum %u in file: \"%s\", file->exists_in_prev: %s, page_st->lsn: %X/%X, prev_backup_start_lsn: %X/%X", + blknum, from_fullpath, + file->exists_in_prev ? 
"true" : "false", + (uint32) (page_st->lsn >> 32), (uint32) page_st->lsn, + (uint32) (prev_backup_start_lsn >> 32), (uint32) prev_backup_start_lsn + ); return SkipCurrentPage; } @@ -2255,7 +2260,7 @@ send_pages(ConnectionArgs* conn_arg, const char *to_fullpath, const char *from_f /* copy local file (взята из send_pages, но используется простое копирование странички, без добавления заголовков и компрессии) */ int copy_pages(const char *to_fullpath, const char *from_fullpath, - pgFile *file, XLogRecPtr prev_backup_start_lsn, + pgFile *file, XLogRecPtr sync_lsn, uint32 checksum_version, bool use_pagemap, BackupMode backup_mode, int ptrack_version_num, const char *ptrack_schema) { @@ -2303,13 +2308,26 @@ copy_pages(const char *to_fullpath, const char *from_fullpath, setvbuf(in, in_buf, _IOFBF, STDIO_BUFSIZE); } - /* ошибки бы тут обработать! */ - out = open_local_file_rw(to_fullpath, &out_buf, STDIO_BUFSIZE); + out = fio_fopen(to_fullpath, PG_BINARY_R "+", FIO_BACKUP_HOST); + if (out == NULL) + elog(ERROR, "Cannot open destination file \"%s\": %s", + to_fullpath, strerror(errno)); + + /* update file permission */ + if (fio_chmod(to_fullpath, file->mode, FIO_BACKUP_HOST) == -1) + elog(ERROR, "Cannot change mode of \"%s\": %s", to_fullpath, + strerror(errno)); + + if (!fio_is_remote_file(out)) + { + out_buf = pgut_malloc(STDIO_BUFSIZE); + setvbuf(out, out_buf, _IOFBF, STDIO_BUFSIZE); + } while (blknum < file->n_blocks) { PageState page_st; - int rc = prepare_page(file, prev_backup_start_lsn, + int rc = prepare_page(file, sync_lsn, blknum, in, backup_mode, curr_page, true, checksum_version, ptrack_version_num, ptrack_schema, @@ -2318,7 +2336,14 @@ copy_pages(const char *to_fullpath, const char *from_fullpath, break; else if (rc == PageIsOk) + { + if (fio_fseek(out, blknum * BLCKSZ) < 0) + { + elog(ERROR, "Cannot seek block %u of \"%s\": %s", + blknum, to_fullpath, strerror(errno)); + } copy_page(file, blknum, in, out, curr_page, to_fullpath); + } n_blocks_read++; @@ 
-2339,8 +2364,8 @@ copy_pages(const char *to_fullpath, const char *from_fullpath, to_fullpath, strerror(errno)); /* close local output file */ - if (out && fclose(out)) - elog(ERROR, "Cannot close the backup file \"%s\": %s", + if (out && fio_fclose(out)) + elog(ERROR, "Cannot close the destination file \"%s\": %s", to_fullpath, strerror(errno)); pg_free(iter); diff --git a/tests/catchup.py b/tests/catchup.py index 04f6f8997..c0c0bfa43 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -217,6 +217,7 @@ def test_remote_delta_catchup(self): base_dir = os.path.join(module_name, fname, 'src'), set_replication = True, ptrack_enable = True, + pg_options = { 'wal_log_hints': 'on' } ) source_pg.slow_start() source_pg.safe_psql("postgres", "CREATE TABLE ultimate_question(answer int)") @@ -227,7 +228,8 @@ def test_remote_delta_catchup(self): backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream'] + ) self.set_replica(source_pg, dest_pg) dest_options = {} dest_options['port'] = str(dest_pg.port) @@ -236,7 +238,7 @@ def test_remote_delta_catchup(self): dest_pg.stop() # make changes on master - source_pg.pgbench_init(scale=10) + source_pg.pgbench_init(scale = 10) pgbench = source_pg.pgbench(options=['-T', '10', '--no-vacuum']) pgbench.wait() source_pg.safe_psql("postgres", "INSERT INTO ultimate_question VALUES(42)") @@ -247,13 +249,18 @@ def test_remote_delta_catchup(self): backup_mode = 'DELTA', source_pgdata = source_pg.data_dir, destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream'] + ) + + source_pgdata = self.pgdata_content(source_pg.data_dir) + dest_pgdata = self.pgdata_content(dest_pg.data_dir) + self.compare_pgdata(source_pgdata, dest_pgdata) # stop replication source_pg.stop() # check 
latest changes - self.set_replica(source_pg, dest_pg) + self.set_replica(master = source_pg, replica = dest_pg) dest_pg.slow_start(replica = True) self.assertEqual( result, From 2cceb71b20042a50eb7eca0dee9a5973594a212a Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Thu, 3 Jun 2021 02:50:39 +0300 Subject: [PATCH 38/63] Review answer #2 --- src/catchup.c | 91 +++++++++++++++++++-------------------------------- src/stream.c | 14 ++++---- 2 files changed, 40 insertions(+), 65 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 4bcdb1881..f59fb3635 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -61,6 +61,9 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads) //Besides, the name of this function looks strange to me. //Maybe catchup_init_state() or catchup_setup() will do better? //I'd also suggest to wrap all these fields into some CatchupState, but it isn't urgent. +/* + * Prepare for work: fill some globals, open connection to source database + */ static PGconn * catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata) { @@ -186,7 +189,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, /* * Check that all tablespaces exists in tablespace mapping (--tablespace-mapping option) - * Emit fatal error if that tablespace found + * Emit fatal error if that (not existent in map) tablespace found */ static void check_tablespaces_existance_in_tbsmapping(PGconn *conn) @@ -227,6 +230,7 @@ check_tablespaces_existance_in_tbsmapping(PGconn *conn) /* * TODO: * - add description + * main worker function, to be moved into do_catchup() and then to be split into meaningful pieces */ static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, @@ -315,13 +319,10 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * (uint32) (dest_redo.lsn >> 32), (uint32) (dest_redo.lsn)); /* Start stream 
replication */ - if (stream_wal) - { - join_path_components(dest_xlog_path, dest_pgdata, PG_XLOG_DIR); - fio_mkdir(dest_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); - start_WAL_streaming(source_conn, dest_xlog_path, &instance_config.conn_opt, - current.start_lsn, current.tli); - } + join_path_components(dest_xlog_path, dest_pgdata, PG_XLOG_DIR); + fio_mkdir(dest_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); + start_WAL_streaming(source_conn, dest_xlog_path, &instance_config.conn_opt, + current.start_lsn, current.tli); source_filelist = parray_new(); @@ -358,7 +359,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * */ parray_qsort(source_filelist, pgFileCompareRelPathWithExternal); - //REVIEW Please adjust the comment. /* Extract information about files in source_filelist parsing their names:*/ parse_filelist_filenames(source_filelist, source_pgdata); @@ -408,7 +408,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* * check if it is fake "directory" and is a tablespace link - * это происходит потому что мы передали follow_symlink при построении списка + * this is because we passed the follow_symlink when building the list */ /* get parent dir of rel_path */ strncpy(parent_dir, file->rel_path, MAXPGPATH); @@ -431,8 +431,9 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * const char *linked_path = NULL; char to_path[MAXPGPATH]; - // perform additional check that this is actually synlink? + // perform additional check that this is actually symlink? //REVIEW Why is this code block separated? 
+ //REVIEW_ANSWER because i want to localize usage of source_full_path and symlink_content { /* get full symlink path and map this path to new location */ char source_full_path[MAXPGPATH]; char symlink_content[MAXPGPATH]; @@ -440,9 +441,10 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * fio_readlink(source_full_path, symlink_content, sizeof(symlink_content), FIO_DB_HOST); //REVIEW What if we won't find mapping for this tablespace? //I'd expect a failure. Otherwise, we may spoil source database data. + // REVIEW_ANSWER we checked that in preflight_checks for local catchup + // and for remote catchup this may be correct behavior linked_path = leaked_abstraction_get_tablespace_mapping(symlink_content); - // TODO: check that linked_path != symlink_content in case of local catchup? - elog(WARNING, "Map tablespace full_path: \"%s\" old_symlink_content: \"%s\" old_symlink_content: \"%s\"\n", + elog(INFO, "Map tablespace full_path: \"%s\" old_symlink_content: \"%s\" new_symlink_content: \"%s\"\n", source_full_path, symlink_content, linked_path); @@ -457,15 +459,15 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(VERBOSE, "Create directory \"%s\" and symbolic link \"%s\"", linked_path, to_path); - //REVIEW Handle return value here. - //We should not proceed if failed to create dir. 
/* create tablespace directory */ - fio_mkdir(linked_path, DIR_PERMISSION, FIO_BACKUP_HOST); + if (fio_mkdir(linked_path, file->mode, FIO_BACKUP_HOST) != 0) + elog(ERROR, "Could not create tablespace directory \"%s\": %s", + linked_path, strerror(errno)); /* create link to linked_path */ if (fio_symlink(linked_path, to_path, true, FIO_BACKUP_HOST) < 0) - elog(ERROR, "Could not create symbolic link \"%s\": %s", - to_path, strerror(errno)); + elog(ERROR, "Could not create symbolic link \"%s\" -> \"%s\": %s", + linked_path, to_path, strerror(errno)); } } @@ -475,7 +477,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * */ { int control_file_elem_index; - pgFile search_key ; + pgFile search_key; MemSet(&search_key, 0, sizeof(pgFile)); /* pgFileCompareRelPathWithExternal uses only .rel_path and .external_dir_num for comparision */ search_key.rel_path = XLOG_CONTROL_FILE; @@ -502,12 +504,13 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * //REVIEW Can we maybe optimize it and use some merge-like algorithm //instead of bsearch for each file? Of course it isn't an urgent fix. + //REVIEW_ANSWER yes, merge will be better if (parray_bsearch(source_filelist, file, pgFileCompareRelPathWithExternal)) redundant = false; /* pg_filenode.map are always restored, because it's crc cannot be trusted */ - if (file->external_dir_num == 0 && - pg_strcasecmp(file->name, RELMAPPER_FILENAME) == 0) + Assert(file->external_dir_num == 0); + if (pg_strcasecmp(file->name, RELMAPPER_FILENAME) == 0) redundant = true; //REVIEW This check seems unneded. Anyway we delete only redundant stuff below. 
@@ -588,6 +591,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } /* at last copy control file */ + if (catchup_isok) { char from_fullpath[MAXPGPATH]; char to_fullpath[MAXPGPATH]; @@ -678,9 +682,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Cleanup */ pg_free(query_text); - //REVIEW Please adjust the comment. - /* In case of backup from replica >= 9.6 we must fix minRecPoint, - * First we must find pg_control in source_filelist. + /* + * In case of backup from replica >= 9.6 we must fix minRecPoint */ if (current.from_replica && !exclusive_backup) { @@ -712,19 +715,9 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * continue; /* construct fullpath */ - if (file->external_dir_num == 0) - join_path_components(to_fullpath, dest_pgdata, file->rel_path); - //REVIEW Let's clean this. - /* TODO разобраться с external */ - /*else - { - char external_dst[MAXPGPATH]; + Assert(file->external_dir_num == 0); + join_path_components(to_fullpath, dest_pgdata, file->rel_path); - makeExternalDirPathByNum(external_dst, external_prefix, - file->external_dir_num); - join_path_components(to_fullpath, external_dst, file->rel_path); - } - */ if (fio_sync(to_fullpath, FIO_BACKUP_HOST) != 0) elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); } @@ -754,7 +747,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } /* - * TODO: add description + * Catchup file copier executed in separate threads */ static void * catchup_thread_runner(void *arg) @@ -788,27 +781,9 @@ catchup_thread_runner(void *arg) i + 1, n_files, file->rel_path); /* construct destination filepath */ - /* TODO разобраться нужен ли external */ - if (file->external_dir_num == 0) - { - join_path_components(from_fullpath, arguments->from_root, file->rel_path); - join_path_components(to_fullpath, arguments->to_root, file->rel_path); - } - //REVIEW Let's clean this. 
- /*else - { - char external_dst[MAXPGPATH]; - char *external_path = parray_get(arguments->external_dirs, - file->external_dir_num - 1); - - makeExternalDirPathByNum(external_dst, - arguments->external_prefix, - file->external_dir_num); - - join_path_components(to_fullpath, external_dst, file->rel_path); - join_path_components(from_fullpath, external_path, file->rel_path); - } - */ + Assert(file->external_dir_num == 0); + join_path_components(from_fullpath, arguments->from_root, file->rel_path); + join_path_components(to_fullpath, arguments->to_root, file->rel_path); /* Encountered some strange beast */ if (!S_ISREG(file->mode)) diff --git a/src/stream.c b/src/stream.c index 9033e04bf..0ebc0eee6 100644 --- a/src/stream.c +++ b/src/stream.c @@ -308,14 +308,14 @@ stop_streaming(XLogRecPtr xlogpos, uint32 timeline, bool segment_finished) /* we assume that we get called once at the end of each segment */ if (segment_finished) - { - elog(VERBOSE, _("finished segment at %X/%X (timeline %u)"), - (uint32) (xlogpos >> 32), (uint32) xlogpos, timeline); + { + elog(VERBOSE, _("finished segment at %X/%X (timeline %u)"), + (uint32) (xlogpos >> 32), (uint32) xlogpos, timeline); - add_walsegment_to_filelist(xlog_files_list, timeline, xlogpos, - (char*) stream_thread_arg.basedir, - instance_config.xlog_seg_size); - } + add_walsegment_to_filelist(xlog_files_list, timeline, xlogpos, + (char*) stream_thread_arg.basedir, + instance_config.xlog_seg_size); + } /* * Note that we report the previous, not current, position here. After a From 803a7e3217ffe8856d066dfef84f0ac9c38e2cb6 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Thu, 3 Jun 2021 05:03:39 +0300 Subject: [PATCH 39/63] add more catchup preflight checks --- src/archive.c | 2 +- src/backup.c | 4 +- src/catchup.c | 159 +++++++++++++++++++++++++++++++++++++++------ src/init.c | 2 +- src/pg_probackup.h | 5 +- src/restore.c | 2 +- src/stream.c | 1 - src/util.c | 14 ++-- 8 files changed, 153 insertions(+), 36 deletions(-) diff --git a/src/archive.c b/src/archive.c index ef87910f8..0225df22b 100644 --- a/src/archive.c +++ b/src/archive.c @@ -148,7 +148,7 @@ do_archive_push(InstanceState *instanceState, InstanceConfig *instance, char *wa elog(ERROR, "getcwd() error"); /* verify that archive-push --instance parameter is valid */ - system_id = get_system_identifier(current_dir); + system_id = get_system_identifier(current_dir, FIO_DB_HOST); if (instance->pgdata == NULL) elog(ERROR, "Cannot read pg_probackup.conf for this instance"); diff --git a/src/backup.c b/src/backup.c index b9cd2207d..d2122b20d 100644 --- a/src/backup.c +++ b/src/backup.c @@ -140,7 +140,7 @@ do_backup_pg(InstanceState *instanceState, PGconn *backup_conn, #if PG_VERSION_NUM >= 90600 current.tli = get_current_timeline(backup_conn); #else - current.tli = get_current_timeline_from_control(false); + current.tli = get_current_timeline_from_control(instance_config.pgdata, FIO_DB_HOST, false); #endif /* @@ -968,7 +968,7 @@ check_system_identifiers(PGconn *conn, const char *pgdata) uint64 system_id_conn; uint64 system_id_pgdata; - system_id_pgdata = get_system_identifier(pgdata); + system_id_pgdata = get_system_identifier(pgdata, FIO_DB_HOST); system_id_conn = get_remote_system_identifier(conn); /* for checkdb check only system_id_pgdata and system_id_conn */ diff --git a/src/catchup.c b/src/catchup.c index f59fb3635..272d867e8 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -29,7 +29,8 @@ static PGconn *catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, const char *dest_pgdata); static void catchup_preflight_checks(PGNodeInfo 
*source_node_info, PGconn *source_conn, const char *source_pgdata, const char *dest_pgdata); -static void check_tablespaces_existance_in_tbsmapping(PGconn *conn); +static void catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn); +static parray* catchup_get_tli_history(ConnectionOptions *conn_opt, TimeLineID tli); static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs); static void *catchup_thread_runner(void *arg); @@ -74,10 +75,7 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co /* Get WAL segments size and system ID of source PG instance */ instance_config.xlog_seg_size = get_xlog_seg_size(source_pgdata); - instance_config.system_identifier = get_system_identifier(source_pgdata); -#if PG_VERSION_NUM < 90600 - instance_config.pgdata = source_pgdata; -#endif + instance_config.system_identifier = get_system_identifier(source_pgdata, FIO_DB_HOST); current.start_time = time(NULL); StrNCpy(current.program_version, PROGRAM_VERSION, sizeof(current.program_version)); @@ -101,7 +99,8 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co #if PG_VERSION_NUM >= 90600 current.tli = get_current_timeline(source_conn); #else - current.tli = get_current_timeline_from_control(false); + instance_config.pgdata = source_pgdata; + current.tli = get_current_timeline_from_control(source_pgdata, FIO_DB_HOST, false); #endif elog(INFO, "Catchup start, pg_probackup version: %s, " @@ -160,9 +159,46 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, dest_pgdata); } - /* Check that connected PG instance and source PGDATA are the same */ - check_system_identifiers(source_conn, source_pgdata); + /* check that postmaster is not running in destination */ + if (current.backup_mode != BACKUP_MODE_FULL) + { + pid_t pid; + pid = fio_check_postmaster(dest_pgdata, FIO_LOCAL_HOST); + if (pid == 1) /* 
postmaster.pid is mangled */ + { + char pid_filename[MAXPGPATH]; + join_path_components(pid_filename, dest_pgdata, "postmaster.pid"); + elog(ERROR, "Pid file \"%s\" is mangled, cannot determine whether postmaster is running or not", + pid_filename); + } + else if (pid > 1) /* postmaster is up */ + { + elog(ERROR, "Postmaster with pid %u is running in destination directory \"%s\"", + pid, dest_pgdata); + } + } + + /* Check that connected PG instance, source and destination PGDATA are the same */ + { + uint64 source_conn_id, source_id, dest_id; + source_conn_id = get_remote_system_identifier(source_conn); + source_id = get_system_identifier(source_pgdata, FIO_DB_HOST); /* same as instance_config.system_identifier */ + + if (source_conn_id != source_id) + elog(ERROR, "Database identifiers mismatch: we connected to DB id %lu, but in \"%s\" we found id %lu", + source_conn_id, source_pgdata, source_id); + + if (current.backup_mode != BACKUP_MODE_FULL) + { + dest_id = get_system_identifier(dest_pgdata, FIO_LOCAL_HOST); + if (source_conn_id != dest_id) + elog(ERROR, "Database identifiers mismatch: we connected to DB id %lu, but in \"%s\" we found id %lu", + source_conn_id, dest_pgdata, dest_id); + } + } + + /* check PTRACK version */ if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK) { if (source_node_info->ptrack_version_num == 0) @@ -174,17 +210,42 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, elog(ERROR, "Ptrack is disabled"); } - if (current.backup_mode != BACKUP_MODE_FULL && - check_incremental_compatibility(dest_pgdata, - instance_config.system_identifier, - INCR_CHECKSUM) != DEST_OK) - elog(ERROR, "Catchup is not possible in this destination"); + /* check backup_label absence in dest */ + if (current.backup_mode != BACKUP_MODE_FULL) + { + char backup_label_filename[MAXPGPATH]; + + join_path_components(backup_label_filename, dest_pgdata, "backup_label"); + if (fio_access(backup_label_filename, F_OK, FIO_LOCAL_HOST) == 0) + 
elog(ERROR, "Destination directory contains \"backup_control\" file"); + } if (current.from_replica && exclusive_backup) elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); + /* if local catchup, check that we don't overwrite tablespace in source pgdata */ if (!fio_is_remote(FIO_DB_HOST)) - check_tablespaces_existance_in_tbsmapping(source_conn); + catchup_check_tablespaces_existance_in_tbsmapping(source_conn); + + /* check timelines */ + if (current.backup_mode != BACKUP_MODE_FULL) + { + TimeLineID dest_tli; + parray *source_timelines; + + dest_tli = get_current_timeline_from_control(dest_pgdata, FIO_LOCAL_HOST, false); + + source_timelines = catchup_get_tli_history(&instance_config.conn_opt, current.tli); + + if (source_timelines != NULL && !tliIsPartOfHistory(source_timelines, dest_tli)) + elog(ERROR, "Destination is not in source history"); + + if (source_timelines != NULL) + { + parray_walk(source_timelines, pfree); + parray_free(source_timelines); + } + } } /* @@ -192,7 +253,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, * Emit fatal error if that (not existent in map) tablespace found */ static void -check_tablespaces_existance_in_tbsmapping(PGconn *conn) +catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) { PGresult *res; int i; @@ -227,6 +288,68 @@ check_tablespaces_existance_in_tbsmapping(PGconn *conn) PQclear(res); } +/* + * Get timeline history via replication connection + * returns parray* of TimeLineHistoryEntry* + */ +static parray* +catchup_get_tli_history(ConnectionOptions *conn_opt, TimeLineID tli) +{ + PGresult *res; + PGconn *conn; + char *history; + char query[128]; + parray *result = NULL; + + snprintf(query, sizeof(query), "TIMELINE_HISTORY %u", tli); + + /* + * Connect in replication mode to the server. 
+ */ + conn = pgut_connect_replication(conn_opt->pghost, + conn_opt->pgport, + conn_opt->pgdatabase, + conn_opt->pguser, + false); + + if (!conn) + return NULL; + + res = PQexec(conn, query); + PQfinish(conn); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + elog(WARNING, "Could not send replication command \"%s\": %s", + query, PQresultErrorMessage(res)); + PQclear(res); + return NULL; + } + + /* + * The response to TIMELINE_HISTORY is a single row result set + * with two fields: filename and content + */ + if (PQnfields(res) != 2 || PQntuples(res) != 1) + { + elog(ERROR, "Unexpected response to TIMELINE_HISTORY command: " + "got %d rows and %d fields, expected %d rows and %d fields", + PQntuples(res), PQnfields(res), 1, 2); + PQclear(res); + return NULL; + } + + history = pgut_strdup(PQgetvalue(res, 0, 1)); + result = parse_tli_history_buffer(history, tli); + + /* some cleanup */ + pg_free(history); + PQclear(res); + + return result; +} + + /* * TODO: * - add description @@ -250,10 +373,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * parray *source_filelist = NULL; parray *dest_filelist = NULL; - //REVIEW FIXME Let's fix it before release. It can cause some obscure bugs. - /* TODO: in case of timeline mistmatch, check that source PG timeline descending from dest PG timeline */ - parray *tli_list = NULL; - /* for fancy reporting */ time_t start_time, end_time; char pretty_time[20]; @@ -365,8 +484,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(LOG, "Start LSN (source): %X/%X, TLI: %X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn), current.tli); - //REVIEW FIXME Huh? Don't we check TLI at all? 
- /* TODO проверить, нужна ли проверка TLI */ if (current.backup_mode != BACKUP_MODE_FULL) elog(LOG, "LSN in destination: %X/%X, TLI: %X", (uint32) (dest_redo.lsn >> 32), (uint32) (dest_redo.lsn), diff --git a/src/init.c b/src/init.c index dc821325a..a4911cb5c 100644 --- a/src/init.c +++ b/src/init.c @@ -57,7 +57,7 @@ do_add_instance(InstanceState *instanceState, InstanceConfig *instance) "(-D, --pgdata)"); /* Read system_identifier from PGDATA */ - instance->system_identifier = get_system_identifier(instance->pgdata); + instance->system_identifier = get_system_identifier(instance->pgdata, FIO_DB_HOST); /* Starting from PostgreSQL 11 read WAL segment size from PGDATA */ instance->xlog_seg_size = get_xlog_seg_size(instance->pgdata); diff --git a/src/pg_probackup.h b/src/pg_probackup.h index af5b60f2a..e6cd202d7 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1170,9 +1170,9 @@ extern XLogRecPtr get_next_record_lsn(const char *archivedir, XLogSegNo segno, T /* in util.c */ extern TimeLineID get_current_timeline(PGconn *conn); -extern TimeLineID get_current_timeline_from_control(bool safe); +extern TimeLineID get_current_timeline_from_control(const char *pgdata_path, fio_location location, bool safe); extern XLogRecPtr get_checkpoint_location(PGconn *conn); -extern uint64 get_system_identifier(const char *pgdata_path); +extern uint64 get_system_identifier(const char *pgdata_path, fio_location location); extern uint64 get_remote_system_identifier(PGconn *conn); extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); @@ -1300,6 +1300,7 @@ extern void start_WAL_streaming(PGconn *backup_conn, char *stream_dst_path, ConnectionOptions *conn_opt, XLogRecPtr startpos, TimeLineID starttli); extern int wait_WAL_streaming_end(parray *backup_files_list); +extern parray* parse_tli_history_buffer(char *history, TimeLineID tli); /* external variables and functions, implemented in backup.c */ typedef struct 
PGStopBackupResult diff --git a/src/restore.c b/src/restore.c index 9a31ebbb7..5e1233a3e 100644 --- a/src/restore.c +++ b/src/restore.c @@ -2187,7 +2187,7 @@ check_incremental_compatibility(const char *pgdata, uint64 system_identifier, */ elog(INFO, "Trying to read pg_control file in destination directory"); - system_id_pgdata = get_system_identifier(pgdata); + system_id_pgdata = get_system_identifier(pgdata, FIO_DB_HOST); if (system_id_pgdata == instance_config.system_identifier) system_id_match = true; diff --git a/src/stream.c b/src/stream.c index 0ebc0eee6..88eab5098 100644 --- a/src/stream.c +++ b/src/stream.c @@ -70,7 +70,6 @@ static void add_walsegment_to_filelist(parray *filelist, uint32 timeline, uint32 xlog_seg_size); static void add_history_file_to_filelist(parray *filelist, uint32 timeline, char *basedir); -static parray* parse_tli_history_buffer(char *history, TimeLineID tli); /* * Run IDENTIFY_SYSTEM through a given connection and diff --git a/src/util.c b/src/util.c index 83bc5b20c..7d486ea44 100644 --- a/src/util.c +++ b/src/util.c @@ -174,7 +174,7 @@ get_current_timeline(PGconn *conn) if (PQresultStatus(res) == PGRES_TUPLES_OK) val = PQgetvalue(res, 0, 0); else - return get_current_timeline_from_control(false); + return get_current_timeline_from_control(instance_config.pgdata, FIO_DB_HOST, false); if (!parse_uint32(val, &tli, 0)) { @@ -182,7 +182,7 @@ get_current_timeline(PGconn *conn) elog(WARNING, "Invalid value of timeline_id %s", val); /* TODO 3.0 remove it and just error out */ - return get_current_timeline_from_control(false); + return get_current_timeline_from_control(instance_config.pgdata, FIO_DB_HOST, false); } return tli; @@ -190,15 +190,15 @@ get_current_timeline(PGconn *conn) /* Get timeline from pg_control file */ TimeLineID -get_current_timeline_from_control(bool safe) +get_current_timeline_from_control(const char *pgdata_path, fio_location location, bool safe) { ControlFileData ControlFile; char *buffer; size_t size; /* First fetch 
file... */ - buffer = slurpFile(instance_config.pgdata, XLOG_CONTROL_FILE, &size, - safe, FIO_DB_HOST); + buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, + safe, location); if (safe && buffer == NULL) return 0; @@ -249,14 +249,14 @@ get_checkpoint_location(PGconn *conn) } uint64 -get_system_identifier(const char *pgdata_path) +get_system_identifier(const char *pgdata_path, fio_location location) { ControlFileData ControlFile; char *buffer; size_t size; /* First fetch file... */ - buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, FIO_DB_HOST); + buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, location); if (buffer == NULL) return 0; digestControlFile(&ControlFile, buffer, size); From 5b46f097d2e2c9e99627668c35f5bf936c3f070a Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Thu, 3 Jun 2021 12:35:49 +0300 Subject: [PATCH 40/63] rename cmdline parameters, remove annoying message --- src/catchup.c | 2 +- src/help.c | 8 ++++---- src/pg_probackup.c | 8 ++++---- src/utils/file.c | 1 - tests/helpers/ptrack_helpers.py | 4 ++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 272d867e8..98e8bf4c7 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -105,7 +105,7 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co elog(INFO, "Catchup start, pg_probackup version: %s, " "PostgreSQL version: %s, " - "remote: %s, catchup-source-pgdata: %s, catchup-destination-pgdata: %s", + "remote: %s, source-pgdata: %s, destination-pgdata: %s", PROGRAM_VERSION, source_node_info->server_version_str, IsSshProtocol() ? 
"true" : "false", source_pgdata, dest_pgdata); diff --git a/src/help.c b/src/help.c index 0d18d359a..faad59cb2 100644 --- a/src/help.c +++ b/src/help.c @@ -249,8 +249,8 @@ help_pg_probackup(void) printf(_(" [--help]\n")); printf(_("\n %s catchup -b catchup-mode\n"), PROGRAM_NAME); - printf(_(" --catchup-source-pgdata=path_to_pgdata_on_remote_server\n")); - printf(_(" --catchup-destination-pgdata=path_to_local_dir\n")); + printf(_(" --source-pgdata=path_to_pgdata_on_remote_server\n")); + printf(_(" --destination-pgdata=path_to_local_dir\n")); printf(_(" [--stream [-S slot-name]] [--temp-slot]\n")); printf(_(" [-j num-threads]\n")); printf(_(" [-T OLDDIR=NEWDIR]\n")); @@ -1029,8 +1029,8 @@ static void help_catchup(void) { printf(_("\n%s catchup -b catchup-mode\n"), PROGRAM_NAME); - printf(_(" --catchup-source-pgdata=path_to_pgdata_on_remote_server\n")); - printf(_(" --catchup-destination-pgdata=path_to_local_dir\n")); + printf(_(" --source-pgdata=path_to_pgdata_on_remote_server\n")); + printf(_(" --destination-pgdata=path_to_local_dir\n")); printf(_(" [--stream [-S slot-name]] [--temp-slot]\n")); printf(_(" [-j num-threads]\n")); printf(_(" [-T OLDDIR=NEWDIR]\n")); diff --git a/src/pg_probackup.c b/src/pg_probackup.c index 2d5e813e7..7270cfb9c 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -205,8 +205,8 @@ static ConfigOption cmd_options[] = { 'b', 185, "dry-run", &dry_run, SOURCE_CMD_STRICT }, { 's', 238, "note", &backup_note, SOURCE_CMD_STRICT }, /* catchup options */ - { 's', 239, "catchup-source-pgdata", &catchup_source_pgdata, SOURCE_CMD_STRICT }, - { 's', 240, "catchup-destination-pgdata", &catchup_destination_pgdata, SOURCE_CMD_STRICT }, + { 's', 239, "source-pgdata", &catchup_source_pgdata, SOURCE_CMD_STRICT }, + { 's', 240, "destination-pgdata", &catchup_destination_pgdata, SOURCE_CMD_STRICT }, /* restore options */ { 's', 136, "recovery-target-time", &target_time, SOURCE_CMD_STRICT }, { 's', 137, "recovery-target-xid", &target_xid, 
SOURCE_CMD_STRICT }, @@ -760,9 +760,9 @@ main(int argc, char *argv[]) if (backup_subcmd == CATCHUP_CMD) { if (catchup_source_pgdata == NULL) - elog(ERROR, "You must specify \"--catchup-source-pgdata\" option with the \"%s\" command", get_subcmd_name(backup_subcmd)); + elog(ERROR, "You must specify \"--source-pgdata\" option with the \"%s\" command", get_subcmd_name(backup_subcmd)); if (catchup_destination_pgdata == NULL) - elog(ERROR, "You must specify \"--catchup-destination-pgdata\" option with the \"%s\" command", get_subcmd_name(backup_subcmd)); + elog(ERROR, "You must specify \"--destination-pgdata\" option with the \"%s\" command", get_subcmd_name(backup_subcmd)); if (current.backup_mode == BACKUP_MODE_INVALID) elog(ERROR, "Required parameter not specified: BACKUP_MODE (-b, --backup-mode)"); if (current.backup_mode != BACKUP_MODE_FULL && current.backup_mode != BACKUP_MODE_DIFF_PTRACK && current.backup_mode != BACKUP_MODE_DIFF_DELTA) diff --git a/src/utils/file.c b/src/utils/file.c index bdcaafa5d..377aafb5b 100644 --- a/src/utils/file.c +++ b/src/utils/file.c @@ -2001,7 +2001,6 @@ fio_copy_pages(const char *to_fullpath, const char *from_fullpath, pgFile *file, COMP_FILE_CRC32(true, file->crc, buf, hdr.size); - elog(INFO, "Copy block %u with size %lu of %s", blknum, hdr.size - sizeof(BackupPageHeader), to_fullpath); if (fio_fseek(out, blknum * BLCKSZ) < 0) { elog(ERROR, "Cannot seek block %u of \"%s\": %s", diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index ae42bd879..8b35bae7c 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -1047,8 +1047,8 @@ def catchup_node( cmd_list = [ 'catchup', '--backup-mode={0}'.format(backup_mode), - '--catchup-source-pgdata={0}'.format(source_pgdata), - '--catchup-destination-pgdata={0}'.format(destination_node.data_dir) + '--source-pgdata={0}'.format(source_pgdata), + '--destination-pgdata={0}'.format(destination_node.data_dir) ] if self.remote: cmd_list += 
['--remote-proto=ssh', '--remote-host=localhost'] From 18c4b46e257dfd5c9952c662b35c81ba8f33d65b Mon Sep 17 00:00:00 2001 From: anastasia Date: Mon, 7 Jun 2021 13:02:17 +0300 Subject: [PATCH 41/63] one more review pass --- src/catchup.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 98e8bf4c7..825e6cb82 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -58,8 +58,8 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads) return 0; } -//REVIEW Please add a comment to this function. -//Besides, the name of this function looks strange to me. + +//REVIEW The name of this function looks strange to me. //Maybe catchup_init_state() or catchup_setup() will do better? //I'd also suggest to wrap all these fields into some CatchupState, but it isn't urgent. /* @@ -79,9 +79,6 @@ catchup_collect_info(PGNodeInfo *source_node_info, const char *source_pgdata, co current.start_time = time(NULL); StrNCpy(current.program_version, PROGRAM_VERSION, sizeof(current.program_version)); - //REVIEW I guess these are some copy-paste leftovers. Let's clean them. - //current.compress_alg = instance_config.compress_alg; - //current.compress_level = instance_config.compress_level; /* Do some compatibility checks and fill basic info about PG instance */ source_conn = pgdata_basic_setup(instance_config.conn_opt, source_node_info); @@ -548,18 +545,13 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * const char *linked_path = NULL; char to_path[MAXPGPATH]; - // perform additional check that this is actually symlink? - //REVIEW Why is this code block separated? - //REVIEW_ANSWER because i want to localize usage of source_full_path and symlink_content + // TODO perform additional check that this is actually symlink? 
{ /* get full symlink path and map this path to new location */ char source_full_path[MAXPGPATH]; char symlink_content[MAXPGPATH]; join_path_components(source_full_path, source_pgdata, file->rel_path); fio_readlink(source_full_path, symlink_content, sizeof(symlink_content), FIO_DB_HOST); - //REVIEW What if we won't find mapping for this tablespace? - //I'd expect a failure. Otherwise, we may spoil source database data. - // REVIEW_ANSWER we checked that in preflight_checks for local catchup - // and for remote catchup this may be correct behavior + /* we checked that mapping exists in preflight_checks for local catchup */ linked_path = leaked_abstraction_get_tablespace_mapping(symlink_content); elog(INFO, "Map tablespace full_path: \"%s\" old_symlink_content: \"%s\" new_symlink_content: \"%s\"\n", source_full_path, @@ -619,9 +611,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * bool redundant = true; pgFile *file = (pgFile *) parray_get(dest_filelist, i); - //REVIEW Can we maybe optimize it and use some merge-like algorithm - //instead of bsearch for each file? Of course it isn't an urgent fix. - //REVIEW_ANSWER yes, merge will be better + //TODO optimize it and use some merge-like algorithm + //instead of bsearch for each file. if (parray_bsearch(source_filelist, file, pgFileCompareRelPathWithExternal)) redundant = false; @@ -653,10 +644,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * } } - //REVIEW Hmm. Why do we need this at all? - //I'd expect that we init pgfile with unset lock... - //Not related to this patch, though. 
- //REVIEW_ANSWER initialization in the pgFileInit function was proposed but was not accepted (see 2c8b7e9) /* clear file locks */ pfilearray_clear_locks(source_filelist); @@ -728,8 +715,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * elog(ERROR, "Data files transferring failed, time elapsed: %s", pretty_time); - //REVIEW The comment looks unrelated to the function. Do I miss something? - //REVIEW_ANSWER because it is a part of pg_stop_backup() calling /* Notify end of backup */ pg_silent_client_messages(source_conn); From 01fe2f976adc719e6ba9a1d54c6672606d3c048a Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 7 Jun 2021 14:09:36 +0300 Subject: [PATCH 42/63] Additional tablespace checks --- src/catchup.c | 29 +++++++++++++++++------------ src/dir.c | 14 +------------- src/pg_probackup.h | 2 +- 3 files changed, 19 insertions(+), 26 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index fbb09c151..78ed6043d 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -58,7 +58,6 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads) return 0; } - //REVIEW The name of this function looks strange to me. //Maybe catchup_init_state() or catchup_setup() will do better? //I'd also suggest to wrap all these fields into some CatchupState, but it isn't urgent. 
@@ -247,7 +246,8 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, /* * Check that all tablespaces exists in tablespace mapping (--tablespace-mapping option) - * Emit fatal error if that (not existent in map) tablespace found + * Check that all local mapped directories is empty if it is local catchup + * Emit fatal error if that (not existent in map or not empty) tablespace found */ static void catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) @@ -271,7 +271,7 @@ catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) Assert (strlen(tablespace_path) > 0); canonicalize_path(tablespace_path); - linked_path = leaked_abstraction_get_tablespace_mapping(tablespace_path); + linked_path = get_tablespace_mapping(tablespace_path); if (strcmp(tablespace_path, linked_path) == 0) /* same result -> not found in mapping */ @@ -279,7 +279,12 @@ catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) "tablespace (\"%s\"), that are not listed in the map", tablespace_path); if (!is_absolute_path(linked_path)) - elog(ERROR, "Tablespace directory path must be an absolute path: %s\n", + elog(ERROR, "Tablespace directory path must be an absolute path: \"%s\"", + linked_path); + + if (current.backup_mode == BACKUP_MODE_FULL + && !dir_is_empty(linked_path, FIO_LOCAL_HOST)) + elog(ERROR, "Target mapped tablespace direcotory (\"%s\") is not empty in local catchup", linked_path); } PQclear(res); @@ -436,7 +441,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Start stream replication */ join_path_components(dest_xlog_path, dest_pgdata, PG_XLOG_DIR); - fio_mkdir(dest_xlog_path, DIR_PERMISSION, FIO_BACKUP_HOST); + fio_mkdir(dest_xlog_path, DIR_PERMISSION, FIO_LOCAL_HOST); start_WAL_streaming(source_conn, dest_xlog_path, &instance_config.conn_opt, current.start_lsn, current.tli); @@ -537,7 +542,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * 
join_path_components(dirpath, dest_pgdata, file->rel_path); elog(VERBOSE, "Create directory '%s'", dirpath); - fio_mkdir(dirpath, DIR_PERMISSION, FIO_BACKUP_HOST); + fio_mkdir(dirpath, DIR_PERMISSION, FIO_LOCAL_HOST); } else { @@ -552,7 +557,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * join_path_components(source_full_path, source_pgdata, file->rel_path); fio_readlink(source_full_path, symlink_content, sizeof(symlink_content), FIO_DB_HOST); /* we checked that mapping exists in preflight_checks for local catchup */ - linked_path = leaked_abstraction_get_tablespace_mapping(symlink_content); + linked_path = get_tablespace_mapping(symlink_content); elog(INFO, "Map tablespace full_path: \"%s\" old_symlink_content: \"%s\" new_symlink_content: \"%s\"\n", source_full_path, symlink_content, @@ -569,12 +574,12 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * linked_path, to_path); /* create tablespace directory */ - if (fio_mkdir(linked_path, file->mode, FIO_BACKUP_HOST) != 0) + if (fio_mkdir(linked_path, file->mode, FIO_LOCAL_HOST) != 0) elog(ERROR, "Could not create tablespace directory \"%s\": %s", linked_path, strerror(errno)); /* create link to linked_path */ - if (fio_symlink(linked_path, to_path, true, FIO_BACKUP_HOST) < 0) + if (fio_symlink(linked_path, to_path, true, FIO_LOCAL_HOST) < 0) elog(ERROR, "Could not create symbolic link \"%s\" -> \"%s\": %s", linked_path, to_path, strerror(errno)); } @@ -702,7 +707,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * join_path_components(from_fullpath, source_pgdata, source_pg_control_file->rel_path); join_path_components(to_fullpath, dest_pgdata, source_pg_control_file->rel_path); copy_pgcontrol_file(from_fullpath, FIO_DB_HOST, - to_fullpath, FIO_BACKUP_HOST, source_pg_control_file); + to_fullpath, FIO_LOCAL_HOST, source_pg_control_file); } time(&end_time); @@ -820,7 +825,7 @@ do_catchup_instance(const char 
*source_pgdata, const char *dest_pgdata, PGconn * Assert(file->external_dir_num == 0); join_path_components(to_fullpath, dest_pgdata, file->rel_path); - if (fio_sync(to_fullpath, FIO_BACKUP_HOST) != 0) + if (fio_sync(to_fullpath, FIO_LOCAL_HOST) != 0) elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); } @@ -828,7 +833,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * * sync pg_control file */ join_path_components(to_fullpath, dest_pgdata, source_pg_control_file->rel_path); - if (fio_sync(to_fullpath, FIO_BACKUP_HOST) != 0) + if (fio_sync(to_fullpath, FIO_LOCAL_HOST) != 0) elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); time(&end_time); diff --git a/src/dir.c b/src/dir.c index 1476f85f1..b79e6914d 100644 --- a/src/dir.c +++ b/src/dir.c @@ -904,7 +904,7 @@ dir_list_file_internal(parray *files, pgFile *parent, const char *parent_dir, * * Copy of function get_tablespace_mapping() from pg_basebackup.c. */ -static const char * +const char * get_tablespace_mapping(const char *dir) { TablespaceListCell *cell; @@ -916,18 +916,6 @@ get_tablespace_mapping(const char *dir) return dir; } -//REVIEW What exactly wrong with this abstraction? I don't get it... -/* - * TODO протёкшая абстрация, надо на этапе ревью решить что с ней делать, - * потому как непонятно, почему мы в backup.c напрямую работаем с созданием - * каталогов, видимо, когда-то подразумевалось, что вся работа будет в dir.c - */ -const char * -leaked_abstraction_get_tablespace_mapping(const char *dir) -{ - return get_tablespace_mapping(dir); -} - /* * Split argument into old_dir and new_dir and append to mapping * list. 
diff --git a/src/pg_probackup.h b/src/pg_probackup.h index e6cd202d7..9970298e4 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1033,7 +1033,7 @@ extern void dir_list_file(parray *files, const char *root, bool exclude, bool follow_symlink, bool add_root, bool backup_logs, bool skip_hidden, int external_dir_num, fio_location location); -extern const char *leaked_abstraction_get_tablespace_mapping(const char *dir); +extern const char *get_tablespace_mapping(const char *dir); extern void create_data_directories(parray *dest_files, const char *data_dir, const char *backup_dir, From 39df7ac9ce8ddc587699af34ffa29767d590b632 Mon Sep 17 00:00:00 2001 From: Elena Indrupskaya Date: Mon, 7 Jun 2021 15:32:33 +0300 Subject: [PATCH 43/63] [DOC] Added documentation for catchup command --- doc/pgprobackup.xml | 274 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 259 insertions(+), 15 deletions(-) diff --git a/doc/pgprobackup.xml b/doc/pgprobackup.xml index 2bf197814..553532cf9 100644 --- a/doc/pgprobackup.xml +++ b/doc/pgprobackup.xml @@ -143,6 +143,14 @@ doc/src/sgml/pgprobackup.sgml wal_file_name option + + pg_probackup + + catchup_mode + =path_to_pgdata_on_remote_server + =path_to_local_dir + option + @@ -283,6 +291,12 @@ doc/src/sgml/pgprobackup.sgml Partial restore: restoring only the specified databases. + + + Catchup: catching up a fallen-behind standby server + with the catchup command. + + To manage backup data, pg_probackup creates a @@ -1076,7 +1090,8 @@ GRANT SELECT ON TABLE pg_catalog.pg_database TO backup; mode: , , , - , + , + , and . @@ -1459,6 +1474,7 @@ pg_probackup backup -B backup_dir --instance + Performing Cluster Verification @@ -1534,6 +1550,7 @@ pg_probackup checkdb --amcheck --skip-block-validation [connection_ higher cost of CPU, memory, and I/O consumption. + Validating a Backup @@ -2101,6 +2118,7 @@ pg_probackup restore -B backup_dir --instance , , , + , and processes can be executed on several parallel threads. 
This can significantly @@ -3418,6 +3436,128 @@ pg_probackup delete -B backup_dir --instance + + + Cloning a Database Instance + + pg_probackup can create a copy of a PostgreSQL + database instance directly, without using the backup catalog. This allows you + to add a new standby server in a parallel mode or to catch up a standby + server that has fallen behind. + + + + Cloning a database instance is different from other pg_probackup + processes: + + + + The backup catalog is not required. + + + + + STREAM WAL delivery mode is only supported. + + + + + Copying external directories + is not supported. + + + + + + + Before cloning a database instance, set up the source database server as follows: + + + + Configure + the database cluster that contains the database instance to copy. + + + + + To copy from a remote server, configure the remote mode. + + + + + To use the PTRACK backup mode, set up PTRACK backups. + + + + + + + To clone a database instance, ensure that the source + database is running and accepting connections and + on the server with the destination database, run the following command: + + +pg_probackup catchup -b catchup-mode --source-pgdata=path_to_pgdata_on_remote_server --destination-pgdata=path_to_local_dir --stream [connection_options] [remote_options] + + + Where catchup_mode can take one of the + following values: FULL, DELTA, or PTRACK. + + + + + FULL — creates a full copy of the database instance. + The destination directory must be empty for this mode. + + + + + DELTA — reads all data files in the data directory and + creates an incremental copy for pages that have changed + since the destination database was shut down cleanly. + For this mode, the destination directory must contain a previous + copy of the database that was shut down cleanly. + + + + + PTRACK — creates an incremental backup tracking page + changes on the fly. + For this mode, the destination directory must contain a previous + copy of the database that was shut down cleanly. 
+ + + + + You can use connection_options to specify + the connection to the source database, and if it is located on a different server, + also specify remote_options. + If the source database contains tablespaces that must be located in + a different directory, additionally specify the + option: + +pg_probackup catchup -b catchup-mode --source-pgdata=path_to_pgdata_on_remote_server --destination-pgdata=path_to_local_dir --stream --tablespace-mapping=OLDDIR=NEWDIR + + To run the catchup command on parallel threads, specify the number + of threads with the option: + +pg_probackup catchup -b catchup-mode --source-pgdata=path_to_pgdata_on_remote_server --destination-pgdata=path_to_local_dir --stream --threads=num_threads + + + + For example, assume that a remote standby server with the database instance in /replica-pgdata data directory has fallen behind. To sync this database instance with that in /master-pgdata data directory, you can run + catchup in the PTRACK mode on four parallel threads as follows: + +pg_probackup catchup --source-pgdata=/master-pgdata --destination-pgdata=/replica-pgdata -p 5432 -d postgres -U remote-postgres-user --stream --backup-mode=PTRACK --remote-host=remote-hostname --remote-user=remote-unix-username -j 4 + + + + Another example shows how you can add a new remote standby server with the PostgreSQL data directory /replica-pgdata by running catchup in the FULL mode on four parallel threads: + +pg_probackup catchup --source-pgdata=/master-pgdata --destination-pgdata=/replica-pgdata -p 5432 -d postgres -U remote-postgres-user --stream --backup-mode=FULL --remote-host=remote-hostname --remote-user=remote-unix-username -j 4 + + + @@ -3576,7 +3716,7 @@ pg_probackup show-config -B backup_dir --instance show pg_probackup show -B backup_dir -[--help] [--instance instance_name [-i backup_id | --archive]] [--format=plain|json] [--no-color] +[--help] [--instance instance_name [-i backup_id | --archive]] [--format=plain|json] Shows the contents of 
the backup catalog. If @@ -3591,8 +3731,6 @@ pg_probackup show -B backup_dir plain text. You can specify the --format=json option to get the result in the JSON format. - If --no-color flag is used, - then the output is not colored. For details on usage, see the sections @@ -4290,6 +4428,120 @@ pg_probackup archive-get -B backup_dir --instance Archiving Options. + + + catchup + +pg_probackup catchup -b catchup_mode +--source-pgdata=path_to_pgdata_on_remote_server +--destination-pgdata=path_to_local_dir +[--help] [--stream] [-j num_threads] +[-T OLDDIR=NEWDIR] +[connection_options] [remote_options] + + + Creates a copy of a PostgreSQL database + instance without using the backup catalog. + + + + + + + + Specifies the catchup mode to use. Possible values are: + + + + + FULL — creates a full copy of the database instance. + + + + + DELTA — reads all data files in the data directory and + creates an incremental copy for pages that have changed + since the destination database was shut down cleanly. + + + + + PTRACK — creates an incremental PTRACK backup tracking + page changes on the fly. + + + + + + + + + + + + Specifies the path to the data directory of the database to be copied, which can be local or remote. + + + + + + + + + Specifies the path to the local data directory of the database to copy to. + + + + + + + + + Makes a STREAM backup, which + includes all the necessary WAL files by streaming them from + the database server via replication protocol. + + + + + + + + + + Sets the number of parallel threads for + catchup process. + + + + + + + + + + Relocates the tablespace from the OLDDIR to the NEWDIR + directory at the time of recovery. Both OLDDIR and NEWDIR must + be absolute paths. If the path contains the equals sign (=), + escape it with a backslash. This option can be specified + multiple times for multiple tablespaces. + + + + + + + + + Additionally, connection + options, remote + mode options can be used. 
+ + + For details on usage, see the section + Creating a Catchup Copy. + + Options @@ -4674,16 +4926,6 @@ pg_probackup archive-get -B backup_dir --instance - - - - - - Disable the coloring for console log messages of warning and error levels. - - - - @@ -4832,7 +5074,8 @@ pg_probackup archive-get -B backup_dir --instance Connection Options You can use these options together with - and + + , , and commands. @@ -5123,6 +5366,7 @@ pg_probackup archive-get -B backup_dir --instance , , , + , , , and commands. From fc760385cfb8c784c2905fed4ceb5ff8a8bcbc17 Mon Sep 17 00:00:00 2001 From: Elena Indrupskaya Date: Mon, 7 Jun 2021 16:01:55 +0300 Subject: [PATCH 44/63] Proofread catchup messages --- src/catchup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 78ed6043d..2bde3edd4 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -145,13 +145,13 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, { if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || current.backup_mode == BACKUP_MODE_DIFF_DELTA) - elog(ERROR, "\"%s\" is empty but incremental catchup mode requested.", + elog(ERROR, "\"%s\" is empty, but incremental catchup mode requested.", dest_pgdata); } else /* dest dir not empty */ { if (current.backup_mode == BACKUP_MODE_FULL) - elog(ERROR, "Can't perform full catchup into not empty directory \"%s\".", + elog(ERROR, "Can't perform full catchup into non-empty directory \"%s\".", dest_pgdata); } @@ -217,7 +217,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, } if (current.from_replica && exclusive_backup) - elog(ERROR, "Catchup from standby is available only for PG >= 9.6"); + elog(ERROR, "Catchup from standby is only available for PostgreSQL >= 9.6"); /* if local catchup, check that we don't overwrite tablespace in source pgdata */ if (!fio_is_remote(FIO_DB_HOST)) @@ -276,7 +276,7 @@ catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) if 
(strcmp(tablespace_path, linked_path) == 0) /* same result -> not found in mapping */ elog(ERROR, "Local catchup executed, but source database contains " - "tablespace (\"%s\"), that are not listed in the map", tablespace_path); + "tablespace (\"%s\"), that is not listed in the map", tablespace_path); if (!is_absolute_path(linked_path)) elog(ERROR, "Tablespace directory path must be an absolute path: \"%s\"", @@ -284,7 +284,7 @@ catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) if (current.backup_mode == BACKUP_MODE_FULL && !dir_is_empty(linked_path, FIO_LOCAL_HOST)) - elog(ERROR, "Target mapped tablespace direcotory (\"%s\") is not empty in local catchup", + elog(ERROR, "Target mapped tablespace directory (\"%s\") is not empty in local catchup", linked_path); } PQclear(res); @@ -881,7 +881,7 @@ catchup_thread_runner(void *arg) /* check for interrupt */ if (interrupted || thread_interrupted) - elog(ERROR, "interrupted during catchup"); + elog(ERROR, "Interrupted during catchup"); if (progress) elog(INFO, "Progress: (%d/%d). Process file \"%s\"", From 132ec941cd3af593c0d7a76b49351cc42177aaf1 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Mon, 7 Jun 2021 16:05:43 +0300 Subject: [PATCH 45/63] fix lost --no-color option in docs --- doc/pgprobackup.xml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/pgprobackup.xml b/doc/pgprobackup.xml index 553532cf9..92f166f8a 100644 --- a/doc/pgprobackup.xml +++ b/doc/pgprobackup.xml @@ -3716,7 +3716,7 @@ pg_probackup show-config -B backup_dir --instance show pg_probackup show -B backup_dir -[--help] [--instance instance_name [-i backup_id | --archive]] [--format=plain|json] +[--help] [--instance instance_name [-i backup_id | --archive]] [--format=plain|json] [--no-color] Shows the contents of the backup catalog. If @@ -3731,6 +3731,8 @@ pg_probackup show -B backup_dir plain text. You can specify the --format=json option to get the result in the JSON format. 
+ If --no-color flag is used, + then the output is not colored. For details on usage, see the sections @@ -4926,6 +4928,16 @@ pg_probackup catchup -b catchup_mode + + + + + + Disable the coloring for console log messages of warning and error levels. + + + + From ce34427567504619f95ed4daad59fcc115420eb6 Mon Sep 17 00:00:00 2001 From: Elena Indrupskaya Date: Mon, 7 Jun 2021 17:27:31 +0300 Subject: [PATCH 46/63] Pure language refinement --- doc/pgprobackup.xml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/pgprobackup.xml b/doc/pgprobackup.xml index 92f166f8a..53dcc3a3f 100644 --- a/doc/pgprobackup.xml +++ b/doc/pgprobackup.xml @@ -293,8 +293,7 @@ doc/src/sgml/pgprobackup.sgml - Catchup: catching up a fallen-behind standby server - with the catchup command. + Catchup: catching up of a fallen-behind standby server. @@ -3442,8 +3441,8 @@ pg_probackup delete -B backup_dir --instance pg_probackup can create a copy of a PostgreSQL database instance directly, without using the backup catalog. This allows you - to add a new standby server in a parallel mode or to catch up a standby - server that has fallen behind. + to add a new standby server in a parallel mode or to have a standby + server that has fallen behind catch up with master. @@ -3546,13 +3545,14 @@ pg_probackup catchup -b catchup-mode --source-pgdata= For example, assume that a remote standby server with the database instance in /replica-pgdata data directory has fallen behind. 
To sync this database instance with that in /master-pgdata data directory, you can run - catchup in the PTRACK mode on four parallel threads as follows: + the catchup command in the PTRACK mode on four parallel threads as follows: pg_probackup catchup --source-pgdata=/master-pgdata --destination-pgdata=/replica-pgdata -p 5432 -d postgres -U remote-postgres-user --stream --backup-mode=PTRACK --remote-host=remote-hostname --remote-user=remote-unix-username -j 4 - Another example shows how you can add a new remote standby server with the PostgreSQL data directory /replica-pgdata by running catchup in the FULL mode on four parallel threads: + Another example shows how you can add a new remote standby server with the PostgreSQL data directory /replica-pgdata by running the catchup command in the FULL mode + on four parallel threads: pg_probackup catchup --source-pgdata=/master-pgdata --destination-pgdata=/replica-pgdata -p 5432 -d postgres -U remote-postgres-user --stream --backup-mode=FULL --remote-host=remote-hostname --remote-user=remote-unix-username -j 4 @@ -4933,7 +4933,7 @@ pg_probackup catchup -b catchup_mode - Disable the coloring for console log messages of warning and error levels. + Disable coloring for console log messages of warning and error levels. From 5089400d8c031e39e2bbb2d9e92adb6077da6822 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Mon, 7 Jun 2021 18:37:25 +0300 Subject: [PATCH 47/63] fix datafile truncation bug --- src/backup.c | 2 +- src/catchup.c | 5 +++-- src/data.c | 12 +++++++++--- src/pg_probackup.h | 2 +- src/utils/file.c | 18 +++++++++++++++++- tests/catchup.py | 22 ++++++++++++++-------- tests/helpers/ptrack_helpers.py | 5 +++-- 7 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/backup.c b/src/backup.c index 915781c27..dd87772da 100644 --- a/src/backup.c +++ b/src/backup.c @@ -2392,4 +2392,4 @@ calculate_datasize_of_filelist(parray *filelist) bytes += file->size; } return bytes; -} \ No newline at end of file +} diff --git a/src/catchup.c b/src/catchup.c index 2bde3edd4..8124359c3 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -213,7 +213,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, join_path_components(backup_label_filename, dest_pgdata, "backup_label"); if (fio_access(backup_label_filename, F_OK, FIO_LOCAL_HOST) == 0) - elog(ERROR, "Destination directory contains \"backup_control\" file"); + elog(ERROR, "Destination directory contains \"backup_label\" file"); } if (current.from_replica && exclusive_backup) @@ -922,7 +922,8 @@ catchup_thread_runner(void *arg) arguments->nodeInfo->checksum_version, arguments->nodeInfo->ptrack_version_num, arguments->nodeInfo->ptrack_schema, - false); + false, + dest_file != NULL ? 
dest_file->size : 0); } else { diff --git a/src/data.c b/src/data.c index 98d46062f..74a25c85b 100644 --- a/src/data.c +++ b/src/data.c @@ -662,7 +662,7 @@ catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpa XLogRecPtr prev_backup_start_lsn, BackupMode backup_mode, CompressAlg calg, int clevel, uint32 checksum_version, int ptrack_version_num, const char *ptrack_schema, - bool is_merge) + bool is_merge, size_t prev_size) { int rc; bool use_pagemap; @@ -689,7 +689,7 @@ catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpa */ if (backup_mode == BACKUP_MODE_DIFF_PTRACK && file->pagemap.bitmapsize == PageBitmapIsEmpty && - file->exists_in_prev && !file->pagemap_isabsent) + file->exists_in_prev && file->size == prev_size && !file->pagemap_isabsent) { /* * There are no changed blocks since last backup. We want to make @@ -793,7 +793,7 @@ catchup_data_file(pgFile *file, const char *from_fullpath, const char *to_fullpa backup_mode == BACKUP_MODE_DIFF_DELTA) file->n_blocks = file->read_size / BLCKSZ; - /* Determine that file didn`t changed in case of incremental backup */ + /* Determine that file didn`t changed in case of incremental catchup */ if (backup_mode != BACKUP_MODE_FULL && file->exists_in_prev && file->write_size == 0 && @@ -2318,6 +2318,12 @@ copy_pages(const char *to_fullpath, const char *from_fullpath, elog(ERROR, "Cannot change mode of \"%s\": %s", to_fullpath, strerror(errno)); + elog(VERBOSE, "ftruncate file \"%s\" to size %lu", + to_fullpath, file->size); + if (fio_ftruncate(out, file->size) == -1) + elog(ERROR, "Cannot ftruncate file \"%s\" to size %lu: %s", + to_fullpath, file->size, strerror(errno)); + if (!fio_is_remote_file(out)) { out_buf = pgut_malloc(STDIO_BUFSIZE); diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 9970298e4..030e6b537 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1104,7 +1104,7 @@ extern void catchup_data_file(pgFile *file, const char *from_fullpath, 
const cha XLogRecPtr prev_backup_start_lsn, BackupMode backup_mode, CompressAlg calg, int clevel, uint32 checksum_version, int ptrack_version_num, const char *ptrack_schema, - bool missing_ok); + bool is_merge, size_t prev_size); extern void backup_non_data_file(pgFile *file, pgFile *prev_file, const char *from_fullpath, const char *to_fullpath, BackupMode backup_mode, time_t parent_backup_time, diff --git a/src/utils/file.c b/src/utils/file.c index 377aafb5b..490b9ddba 100644 --- a/src/utils/file.c +++ b/src/utils/file.c @@ -1939,11 +1939,27 @@ fio_copy_pages(const char *to_fullpath, const char *from_fullpath, pgFile *file, if (use_pagemap) IO_CHECK(fio_write_all(fio_stdout, (*file).pagemap.bitmap, (*file).pagemap.bitmapsize), (*file).pagemap.bitmapsize); - //out = open_local_file_rw_append(to_fullpath, &out_buf, STDIO_BUFSIZE); out = fio_fopen(to_fullpath, PG_BINARY_R "+", FIO_BACKUP_HOST); if (out == NULL) elog(ERROR, "Cannot open restore target file \"%s\": %s", to_fullpath, strerror(errno)); + /* update file permission */ + if (fio_chmod(to_fullpath, file->mode, FIO_BACKUP_HOST) == -1) + elog(ERROR, "Cannot change mode of \"%s\": %s", to_fullpath, + strerror(errno)); + + elog(VERBOSE, "ftruncate file \"%s\" to size %lu", + to_fullpath, file->size); + if (fio_ftruncate(out, file->size) == -1) + elog(ERROR, "Cannot ftruncate file \"%s\" to size %lu: %s", + to_fullpath, file->size, strerror(errno)); + + if (!fio_is_remote_file(out)) + { + out_buf = pgut_malloc(STDIO_BUFSIZE); + setvbuf(out, out_buf, _IOFBF, STDIO_BUFSIZE); + } + while (true) { fio_header hdr; diff --git a/tests/catchup.py b/tests/catchup.py index c0c0bfa43..dd126ecd3 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -1,3 +1,4 @@ +import io import os import unittest from .helpers.ptrack_helpers import ProbackupTest, ProbackupException @@ -24,7 +25,7 @@ def test_multithread_local_transfer(self): result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") dest_pg = 
self.make_empty_node(os.path.join(module_name, fname, 'dst')) - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, @@ -66,7 +67,7 @@ def test_local_simple_transfer_with_tablespace(self): dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) tblspace1_new_path = self.get_tblspace_path(dest_pg, 'tblspace1_new') - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, @@ -113,7 +114,7 @@ def test_multithread_remote_transfer(self): result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, @@ -160,7 +161,7 @@ def test_remote_ptrack_catchup(self): # make clean shutdowned lagging behind replica dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, @@ -224,7 +225,7 @@ def test_remote_delta_catchup(self): # make clean shutdowned lagging behind replica dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, @@ -292,7 +293,7 @@ def test_table_drop(self): "CREATE TABLE ultimate_question AS SELECT 42 AS answer") dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, @@ -354,7 +355,7 @@ def test_tablefile_truncation(self): source_pg.safe_psql("postgres", "VACUUM t_heap") dest_pg = 
self.make_empty_node(os.path.join(module_name, fname, 'dst')) - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, @@ -380,6 +381,8 @@ def test_tablefile_truncation(self): source_pgdata = source_pg.data_dir, destination_node = dest_pg, options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + with io.open(os.path.join(dest_pg.logs_dir, 'catchup.log'), 'a') as catchup_log: + catchup_log.write(self.output) source_pgdata = self.pgdata_content(source_pg.data_dir) dest_pgdata = self.pgdata_content(dest_pg.data_dir) @@ -391,6 +394,9 @@ def test_tablefile_truncation(self): # @unittest.skip("skip") def test_local_tablespace_without_mapping(self): + if self.remote: + return unittest.skip('Skipped because this test tests local catchup error handling') + fname = self.id().split('.')[3] source_pg = self.make_simple_node( @@ -409,7 +415,7 @@ def test_local_tablespace_without_mapping(self): dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) try: - dest_pg = self.catchup_node( + self.catchup_node( backup_mode = 'FULL', source_pgdata = source_pg.data_dir, destination_node = dest_pg, diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index 8b35bae7c..1ce4a8355 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -1052,9 +1052,10 @@ def catchup_node( ] if self.remote: cmd_list += ['--remote-proto=ssh', '--remote-host=localhost'] + if self.verbose: + cmd_list += ['--log-level-console=verbose'] - self.run_pb(cmd_list + options) - return destination_node + return self.run_pb(cmd_list + options) def show_pb( self, backup_dir, instance=None, backup_id=None, From 3e43d00530a06751511834056b6faeba914ea2ee Mon Sep 17 00:00:00 2001 From: Elena Indrupskaya Date: Tue, 8 Jun 2021 11:53:00 +0300 Subject: [PATCH 48/63] Wording in documentation refined from techwriters' feedback --- doc/pgprobackup.xml | 32 
++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/pgprobackup.xml b/doc/pgprobackup.xml index 53dcc3a3f..e2e8fe954 100644 --- a/doc/pgprobackup.xml +++ b/doc/pgprobackup.xml @@ -293,7 +293,7 @@ doc/src/sgml/pgprobackup.sgml - Catchup: catching up of a fallen-behind standby server. + Catchup: cloning a PostgreSQL instance for a fallen-behind standby server to catch up with master. @@ -3437,16 +3437,16 @@ pg_probackup delete -B backup_dir --instance - Cloning a Database Instance + Cloning <productname>PostgreSQL</productname> Instance pg_probackup can create a copy of a PostgreSQL - database instance directly, without using the backup catalog. This allows you + instance directly, without using the backup catalog. This allows you to add a new standby server in a parallel mode or to have a standby server that has fallen behind catch up with master. - Cloning a database instance is different from other pg_probackup + Cloning a PostgreSQL instance is different from other pg_probackup processes: @@ -3469,12 +3469,12 @@ pg_probackup delete -B backup_dir --instance - Before cloning a database instance, set up the source database server as follows: + Before cloning a PostgreSQL instance, set up the source database server as follows: Configure - the database cluster that contains the database instance to copy. + the database cluster for the instance to copy. @@ -3491,8 +3491,8 @@ pg_probackup delete -B backup_dir --instance - To clone a database instance, ensure that the source - database is running and accepting connections and + To clone a PostgreSQL instance, ensure that the source + database server is running and accepting connections and on the server with the destination database, run the following command: @@ -3505,7 +3505,7 @@ pg_probackup catchup -b catchup-mode --source-pgdata= - FULL — creates a full copy of the database instance. + FULL — creates a full copy of the PostgreSQL instance. 
The destination directory must be empty for this mode. @@ -3529,7 +3529,7 @@ pg_probackup catchup -b catchup-mode --source-pgdata= You can use connection_options to specify - the connection to the source database, and if it is located on a different server, + the connection to the source database cluster. If it is located on a different server, also specify remote_options. If the source database contains tablespaces that must be located in a different directory, additionally specify the @@ -3544,7 +3544,7 @@ pg_probackup catchup -b catchup-mode --source-pgdata= - For example, assume that a remote standby server with the database instance in /replica-pgdata data directory has fallen behind. To sync this database instance with that in /master-pgdata data directory, you can run + For example, assume that a remote standby server with the PostgreSQL instance in /replica-pgdata data directory has fallen behind. To sync this instance with the one in /master-pgdata data directory, you can run the catchup command in the PTRACK mode on four parallel threads as follows: pg_probackup catchup --source-pgdata=/master-pgdata --destination-pgdata=/replica-pgdata -p 5432 -d postgres -U remote-postgres-user --stream --backup-mode=PTRACK --remote-host=remote-hostname --remote-user=remote-unix-username -j 4 @@ -4442,7 +4442,7 @@ pg_probackup catchup -b catchup_mode [connection_options] [remote_options] - Creates a copy of a PostgreSQL database + Creates a copy of a PostgreSQL instance without using the backup catalog. @@ -4456,7 +4456,7 @@ pg_probackup catchup -b catchup_mode - FULL — creates a full copy of the database instance. + FULL — creates a full copy of the PostgreSQL instance. @@ -4481,7 +4481,7 @@ pg_probackup catchup -b catchup_mode - Specifies the path to the data directory of the database to be copied, which can be local or remote. + Specifies the path to the data directory of the instance to be copied. The path can be local or remote. 
@@ -4490,7 +4490,7 @@ pg_probackup catchup -b catchup_mode - Specifies the path to the local data directory of the database to copy to. + Specifies the path to the local data directory to copy to. @@ -4541,7 +4541,7 @@ pg_probackup catchup -b catchup_mode For details on usage, see the section - Creating a Catchup Copy. + Cloning PostgreSQL Instance. From 55c114f7b49bf8ba17629e95fcdb306e3a15119b Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Tue, 8 Jun 2021 12:21:54 +0300 Subject: [PATCH 49/63] bugfix: remove unnecessary tablespace_map file write --- src/catchup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 8124359c3..866e308ad 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -745,10 +745,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * wait_wal_and_calculate_stop_lsn(dest_xlog_path, stop_backup_result.lsn, ¤t); - /* Write backup_label and tablespace_map */ - Assert(stop_backup_result.backup_label_content != NULL); - /* Write backup_label */ + Assert(stop_backup_result.backup_label_content != NULL); pg_stop_backup_write_file_helper(dest_pgdata, PG_BACKUP_LABEL_FILE, "backup label", stop_backup_result.backup_label_content, stop_backup_result.backup_label_content_len, NULL); @@ -756,13 +754,17 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * stop_backup_result.backup_label_content = NULL; stop_backup_result.backup_label_content_len = 0; - /* Write tablespace_map */ + /* tablespace_map */ if (stop_backup_result.tablespace_map_content != NULL) { // TODO what if tablespace is created during catchup? - pg_stop_backup_write_file_helper(dest_pgdata, PG_TABLESPACE_MAP_FILE, "tablespace map", - stop_backup_result.tablespace_map_content, stop_backup_result.tablespace_map_content_len, - NULL); + /* Because we have already created symlinks in pg_tblspc earlier, + * we do not need to write the tablespace_map file. 
+ * So this call is unnecessary: + * pg_stop_backup_write_file_helper(dest_pgdata, PG_TABLESPACE_MAP_FILE, "tablespace map", + * stop_backup_result.tablespace_map_content, stop_backup_result.tablespace_map_content_len, + * NULL); + */ free(stop_backup_result.tablespace_map_content); stop_backup_result.tablespace_map_content = NULL; stop_backup_result.tablespace_map_content_len = 0; From 9cde0f9af377d6d36d42fa56a7488611bea25cac Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Tue, 8 Jun 2021 13:13:03 +0300 Subject: [PATCH 50/63] Modify tablespace preflight checks (as per comment by Roman Zharkov) --- src/catchup.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 866e308ad..901f2feb4 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -219,9 +219,8 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, if (current.from_replica && exclusive_backup) elog(ERROR, "Catchup from standby is only available for PostgreSQL >= 9.6"); - /* if local catchup, check that we don't overwrite tablespace in source pgdata */ - if (!fio_is_remote(FIO_DB_HOST)) - catchup_check_tablespaces_existance_in_tbsmapping(source_conn); + /* check that we don't overwrite tablespace in source pgdata */ + catchup_check_tablespaces_existance_in_tbsmapping(source_conn); /* check timelines */ if (current.backup_mode != BACKUP_MODE_FULL) @@ -246,7 +245,7 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, /* * Check that all tablespaces exists in tablespace mapping (--tablespace-mapping option) - * Check that all local mapped directories is empty if it is local catchup + * Check that all local mapped directories is empty if it is local FULL catchup * Emit fatal error if that (not existent in map or not empty) tablespace found */ static void @@ -274,9 +273,15 @@ catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) linked_path = 
get_tablespace_mapping(tablespace_path); if (strcmp(tablespace_path, linked_path) == 0) - /* same result -> not found in mapping */ - elog(ERROR, "Local catchup executed, but source database contains " - "tablespace (\"%s\"), that is not listed in the map", tablespace_path); + /* same result -> not found in mapping */ + { + if (!fio_is_remote(FIO_DB_HOST)) + elog(ERROR, "Local catchup executed, but source database contains " + "tablespace (\"%s\"), that is not listed in the map", tablespace_path); + else + elog(WARNING, "Remote catchup executed and source database contains " + "tablespace (\"%s\"), that is not listed in the map", tablespace_path); + } if (!is_absolute_path(linked_path)) elog(ERROR, "Tablespace directory path must be an absolute path: \"%s\"", @@ -284,7 +289,7 @@ catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn) if (current.backup_mode == BACKUP_MODE_FULL && !dir_is_empty(linked_path, FIO_LOCAL_HOST)) - elog(ERROR, "Target mapped tablespace directory (\"%s\") is not empty in local catchup", + elog(ERROR, "Target mapped tablespace directory (\"%s\") is not empty in FULL catchup", linked_path); } PQclear(res); From 1ab642a80fb8ed7c8b952c9bc74d32539eac49f0 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Wed, 9 Jun 2021 12:33:29 +0300 Subject: [PATCH 51/63] typo --- src/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stream.c b/src/stream.c index 88eab5098..615d25281 100644 --- a/src/stream.c +++ b/src/stream.c @@ -172,7 +172,7 @@ StreamLog(void *arg) */ stream_arg->startpos -= stream_arg->startpos % instance_config.xlog_seg_size; - xlog_files_list = parray_new(); + xlog_files_list = parray_new(); /* Initialize timeout */ stream_stop_begin = 0; From 6240c9f63227486b0a8dc6336ed62b787f8b5b11 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Wed, 9 Jun 2021 12:35:07 +0300 Subject: [PATCH 52/63] bugfix: fix incorrect pg_control reading (as per comment by Roman Zharkov) --- src/catchup.c | 6 +++--- src/pg_probackup.h | 2 +- src/restore.c | 2 +- src/util.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index 901f2feb4..fc004506e 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -211,9 +211,9 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, { char backup_label_filename[MAXPGPATH]; - join_path_components(backup_label_filename, dest_pgdata, "backup_label"); + join_path_components(backup_label_filename, dest_pgdata, PG_BACKUP_LABEL_FILE); if (fio_access(backup_label_filename, F_OK, FIO_LOCAL_HOST) == 0) - elog(ERROR, "Destination directory contains \"backup_label\" file"); + elog(ERROR, "Destination directory contains \"" PG_BACKUP_LABEL_FILE "\" file"); } if (current.from_replica && exclusive_backup) @@ -414,7 +414,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * true, true, false, backup_logs, true, 0, FIO_LOCAL_HOST); // fill dest_redo.lsn and dest_redo.tli - get_redo(dest_pgdata, &dest_redo); + get_redo(dest_pgdata, FIO_LOCAL_HOST, &dest_redo); elog(INFO, "syncLSN = %X/%X", (uint32) (dest_redo.lsn >> 32), (uint32) dest_redo.lsn); } diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 030e6b537..01559a52f 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -1177,7 +1177,7 @@ extern uint64 get_remote_system_identifier(PGconn *conn); extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); extern uint32 get_xlog_seg_size(const char *pgdata_path); -extern void get_redo(const char *pgdata_path, RedoParams *redo); +extern void get_redo(const char *pgdata_path, fio_location pgdata_location, RedoParams *redo); extern void set_min_recovery_point(pgFile *file, const char *backup_path, XLogRecPtr 
stop_backup_lsn); extern void copy_pgcontrol_file(const char *from_fullpath, fio_location from_location, diff --git a/src/restore.c b/src/restore.c index 5e1233a3e..e785e551b 100644 --- a/src/restore.c +++ b/src/restore.c @@ -485,7 +485,7 @@ do_restore_or_validate(InstanceState *instanceState, time_t target_backup_id, pg { RedoParams redo; parray *timelines = NULL; - get_redo(instance_config.pgdata, &redo); + get_redo(instance_config.pgdata, FIO_DB_HOST, &redo); if (redo.checksum_version == 0) elog(ERROR, "Incremental restore in 'lsn' mode require " diff --git a/src/util.c b/src/util.c index 7d486ea44..094e5b3dd 100644 --- a/src/util.c +++ b/src/util.c @@ -352,14 +352,14 @@ get_pgcontrol_checksum(const char *pgdata_path) } void -get_redo(const char *pgdata_path, RedoParams *redo) +get_redo(const char *pgdata_path, fio_location pgdata_location, RedoParams *redo) { ControlFileData ControlFile; char *buffer; size_t size; /* First fetch file... */ - buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, FIO_DB_HOST); + buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, pgdata_location); digestControlFile(&ControlFile, buffer, size); pg_free(buffer); From 1a48b0a7ac56633f722b4e82df8df6a572401b54 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Wed, 9 Jun 2021 13:44:26 +0300 Subject: [PATCH 53/63] bugfix: add additional cmdline check (reported by Roman Zharkov) --- src/pg_probackup.c | 9 ++++++++- tests/catchup.py | 3 --- tests/helpers/ptrack_helpers.py | 5 ++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/pg_probackup.c b/src/pg_probackup.c index 7270cfb9c..1ae8fa660 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -451,7 +451,7 @@ main(int argc, char *argv[]) catalogState->catalog_path, WAL_SUBDIR); } - /* backup_path is required for all pg_probackup commands except help, version and checkdb */ + /* backup_path is required for all pg_probackup commands except help, version, checkdb and catchup */ if (backup_path == NULL && backup_subcmd != CHECKDB_CMD && backup_subcmd != HELP_CMD && @@ -598,6 +598,13 @@ main(int argc, char *argv[]) "You must specify --log-directory option when running checkdb with " "--log-level-file option enabled."); + if (backup_subcmd == CATCHUP_CMD && + instance_config.logger.log_level_file != LOG_OFF && + instance_config.logger.log_directory == NULL) + elog(ERROR, "Cannot save catchup logs to a file. 
" + "You must specify --log-directory option when running catchup with " + "--log-level-file option enabled."); + /* Initialize logger */ init_logger(backup_path, &instance_config.logger); diff --git a/tests/catchup.py b/tests/catchup.py index dd126ecd3..91ce9e961 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -1,4 +1,3 @@ -import io import os import unittest from .helpers.ptrack_helpers import ProbackupTest, ProbackupException @@ -381,8 +380,6 @@ def test_tablefile_truncation(self): source_pgdata = source_pg.data_dir, destination_node = dest_pg, options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) - with io.open(os.path.join(dest_pg.logs_dir, 'catchup.log'), 'a') as catchup_log: - catchup_log.write(self.output) source_pgdata = self.pgdata_content(source_pg.data_dir) dest_pgdata = self.pgdata_content(dest_pg.data_dir) diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py index 1ce4a8355..de19db8c4 100644 --- a/tests/helpers/ptrack_helpers.py +++ b/tests/helpers/ptrack_helpers.py @@ -1053,7 +1053,10 @@ def catchup_node( if self.remote: cmd_list += ['--remote-proto=ssh', '--remote-host=localhost'] if self.verbose: - cmd_list += ['--log-level-console=verbose'] + cmd_list += [ + '--log-level-file=VERBOSE', + '--log-directory={0}'.format(destination_node.logs_dir) + ] return self.run_pb(cmd_list + options) From 1811eb7b3999fa3802333b019b96b39a9f9d90ca Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Thu, 10 Jun 2021 15:51:28 +0300 Subject: [PATCH 54/63] refine catchup tests --- tests/__init__.py | 4 +- tests/catchup.py | 663 ++++++++++++++++++++++++++-------------------- 2 files changed, 382 insertions(+), 285 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index dbf84feea..080512760 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -6,7 +6,8 @@ retention, pgpro560, pgpro589, pgpro2068, false_positive, replica, \ compression, page, ptrack, archive, exclude, cfs_backup, cfs_restore, \ cfs_validate_backup, auth_test, time_stamp, snapfs, logging, \ - locking, remote, external, config, checkdb, set_backup, incr_restore + locking, remote, external, config, checkdb, set_backup, incr_restore, \ + catchup def load_tests(loader, tests, pattern): @@ -23,6 +24,7 @@ def load_tests(loader, tests, pattern): # suite.addTests(loader.loadTestsFromModule(auth_test)) suite.addTests(loader.loadTestsFromModule(archive)) suite.addTests(loader.loadTestsFromModule(backup)) + suite.addTests(loader.loadTestsFromModule(catchup)) suite.addTests(loader.loadTestsFromModule(compatibility)) suite.addTests(loader.loadTestsFromModule(checkdb)) suite.addTests(loader.loadTestsFromModule(config)) diff --git a/tests/catchup.py b/tests/catchup.py index 91ce9e961..0332bcb6e 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -5,420 +5,515 @@ module_name = 'catchup' class CatchupTest(ProbackupTest, unittest.TestCase): + def setUp(self): + self.fname = self.id().split('.')[3] - # @unittest.skip("skip") - def test_multithread_local_transfer(self): +######################################### +# Basic tests +######################################### + def test_simple_full_catchup(self): """ - Test 'multithreaded basebackup' mode - create node, insert some test data, catchup into other dir, start, select test data + Test 'multithreaded basebackup' mode (aka FULL catchup) """ - fname = self.id().split('.')[3] - - source_pg = self.make_simple_node( - base_dir = 
os.path.join(module_name, fname, 'src'), - set_replication=True) - source_pg.slow_start() - source_pg.safe_psql( + # preparation + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True + ) + src_pg.slow_start() + src_pg.safe_psql( "postgres", "CREATE TABLE ultimate_question AS SELECT 42 AS answer") - result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") - dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + # do full catchup + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4'] + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] ) - source_pg.stop() - - dest_options = {} - dest_options['port'] = str(dest_pg.port) - self.set_auto_conf(dest_pg, dest_options) - dest_pg.slow_start() - self.assertEqual( - result, - dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), - 'Different answer from copy') - dest_pg.stop() - # Clean after yourself - self.del_test_dir(module_name, fname) + # 1st check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) - # @unittest.skip("skip") - def test_local_simple_transfer_with_tablespace(self): - fname = self.id().split('.')[3] + # run&recover catchup'ed instance + src_pg.stop() + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() - source_pg = self.make_simple_node( - base_dir = os.path.join(module_name, fname, 'src'), - initdb_params = ['--data-checksums']) - source_pg.slow_start() + # 2nd check: run 
verification query + dst_query_result = dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') - tblspace1_old_path = self.get_tblspace_path(source_pg, 'tblspace1_old') - self.create_tblspace_in_node( - source_pg, 'tblspace1', - tblspc_path = tblspace1_old_path) + # Cleanup + dst_pg.stop() + self.del_test_dir(module_name, self.fname) - source_pg.safe_psql( + def test_full_catchup_with_tablespace(self): + """ + Test tablespace transfers + """ + # preparation + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True + ) + src_pg.slow_start() + tblspace1_old_path = self.get_tblspace_path(src_pg, 'tblspace1_old') + self.create_tblspace_in_node(src_pg, 'tblspace1', tblspc_path = tblspace1_old_path) + src_pg.safe_psql( "postgres", "CREATE TABLE ultimate_question TABLESPACE tblspace1 AS SELECT 42 AS answer") - result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") - dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) - tblspace1_new_path = self.get_tblspace_path(dest_pg, 'tblspace1_new') + # do full catchup with tablespace mapping + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + tblspace1_new_path = self.get_tblspace_path(dst_pg, 'tblspace1_new') self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, options = [ '-d', 'postgres', - '-p', str(source_pg.port), + '-p', str(src_pg.port), '--stream', '-T', '{0}={1}'.format(tblspace1_old_path, tblspace1_new_path) ] ) - source_pgdata = self.pgdata_content(source_pg.data_dir) - dest_pgdata = self.pgdata_content(dest_pg.data_dir) - self.compare_pgdata(source_pgdata, dest_pgdata) + # 1st check: compare data 
directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) - source_pg.stop() + # make changes in master tablespace + src_pg.safe_psql( + "postgres", + "UPDATE ultimate_question SET answer = -1") + src_pg.stop() - dest_options = {} - dest_options['port'] = str(dest_pg.port) - self.set_auto_conf(dest_pg, dest_options) - dest_pg.slow_start() - self.assertEqual( - result, - dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), - 'Different answer from copy') - dest_pg.stop() + # run&recover catchup'ed instance + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() - # Clean after yourself - self.del_test_dir(module_name, fname) + # 2nd check: run verification query + dst_query_result = dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') + + # Cleanup + dst_pg.stop() + self.del_test_dir(module_name, self.fname) - # @unittest.skip("skip") - def test_multithread_remote_transfer(self): + def test_simple_delta_catchup(self): """ - Test 'multithreaded basebackup' mode - create node, insert some test data, catchup into other dir, start, select test data + Test delta catchup """ - fname = self.id().split('.')[3] - - source_pg = self.make_simple_node(base_dir = os.path.join(module_name, fname, 'src')) - source_pg.slow_start() - source_pg.safe_psql( + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, + pg_options = { 'wal_log_hints': 'on' } + ) + src_pg.slow_start() + src_pg.safe_psql( "postgres", - "CREATE TABLE ultimate_question AS SELECT 42 AS answer") - result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + "CREATE TABLE ultimate_question(answer int)") - dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + 
# preparation 2: make clean shutdowned lagging behind replica + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream', '-j', '4']) - source_pg.stop() + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + self.set_replica(src_pg, dst_pg) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start(replica = True) + dst_pg.stop() + + # preparation 3: make changes on master (source) + src_pg.pgbench_init(scale = 10) + pgbench = src_pg.pgbench(options=['-T', '10', '--no-vacuum']) + pgbench.wait() + src_pg.safe_psql("postgres", "INSERT INTO ultimate_question VALUES(42)") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") - dest_options = {} - dest_options['port'] = str(dest_pg.port) - self.set_auto_conf(dest_pg, dest_options) - dest_pg.slow_start() - self.assertEqual( - result, - dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), - 'Different answer from copy') - dest_pg.stop() + # do delta catchup + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) - # Clean after yourself - self.del_test_dir(module_name, fname) + # 1st check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) + + # run&recover catchup'ed instance + src_pg.stop() + self.set_replica(master = src_pg, replica = dst_pg) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start(replica = True) + + # 2nd check: run verification query + dst_query_result = 
dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') + + # Cleanup + dst_pg.stop() + self.del_test_dir(module_name, self.fname) - # @unittest.skip("skip") - def test_remote_ptrack_catchup(self): + def test_simple_ptrack_catchup(self): """ - Test 'catchup' mode - create node, - make a copy with replication, start copy, stop copy, - generate some load on master, insert some test data on master, - catchup copy, start and select test data + Test ptrack catchup """ if not self.ptrack: return unittest.skip('Skipped because ptrack support is disabled') - fname = self.id().split('.')[3] - - # prepare master - source_pg = self.make_simple_node( - base_dir = os.path.join(module_name, fname, 'src'), + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), set_replication = True, ptrack_enable = True, initdb_params = ['--data-checksums'] ) - source_pg.slow_start() - source_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") - source_pg.safe_psql("postgres", "CREATE TABLE ultimate_question(answer int)") + src_pg.slow_start() + src_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") + src_pg.safe_psql( + "postgres", + "CREATE TABLE ultimate_question(answer int)") - # make clean shutdowned lagging behind replica - dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + # preparation 2: make clean shutdowned lagging behind replica + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) - self.set_replica(source_pg, dest_pg) - dest_pg.slow_start(replica = True) - dest_pg.stop() - - # make changes on master - source_pg.pgbench_init(scale=10) - pgbench = source_pg.pgbench(options=['-T', '10', '--no-vacuum']) + 
source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + self.set_replica(src_pg, dst_pg) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start(replica = True) + dst_pg.stop() + + # preparation 3: make changes on master (source) + src_pg.pgbench_init(scale = 10) + pgbench = src_pg.pgbench(options=['-T', '10', '--no-vacuum']) pgbench.wait() - source_pg.safe_psql("postgres", "INSERT INTO ultimate_question VALUES(42)") - result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + src_pg.safe_psql("postgres", "INSERT INTO ultimate_question VALUES(42)") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") - # catchup + # do ptrack catchup self.catchup_node( backup_mode = 'PTRACK', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) - - # stop replication - source_pg.stop() - - # check latest changes - dest_options = {} - dest_options['port'] = str(dest_pg.port) - self.set_auto_conf(dest_pg, dest_options) - self.set_replica(source_pg, dest_pg) - dest_pg.slow_start(replica = True) - self.assertEqual( - result, - dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), - 'Different answer from copy') - dest_pg.stop() + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) - # Clean after yourself - self.del_test_dir(module_name, fname) + # 1st check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) - # @unittest.skip("skip") - def test_remote_delta_catchup(self): + # run&recover catchup'ed instance + src_pg.stop() + self.set_replica(master = src_pg, replica = dst_pg) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + 
self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start(replica = True) + + # 2nd check: run verification query + dst_query_result = dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') + + # Cleanup + dst_pg.stop() + self.del_test_dir(module_name, self.fname) + +######################################### +# Test various corner conditions +######################################### + def test_table_drop_with_delta(self): """ - Test 'catchup' mode - create node, - make a copy with replication, start copy, stop copy, - generate some load on master, insert some test data on master, - catchup copy, start and select test data + Test that dropped table in source will be dropped in delta catchup'ed instance too """ - fname = self.id().split('.')[3] - - # prepare master - source_pg = self.make_simple_node( - base_dir = os.path.join(module_name, fname, 'src'), + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), set_replication = True, - ptrack_enable = True, pg_options = { 'wal_log_hints': 'on' } ) - source_pg.slow_start() - source_pg.safe_psql("postgres", "CREATE TABLE ultimate_question(answer int)") + src_pg.slow_start() + src_pg.safe_psql( + "postgres", + "CREATE TABLE ultimate_question AS SELECT 42 AS answer") - # make clean shutdowned lagging behind replica - dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + # preparation 2: make clean shutdowned lagging behind replica + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream'] + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] ) - self.set_replica(source_pg, 
dest_pg) - dest_options = {} - dest_options['port'] = str(dest_pg.port) - self.set_auto_conf(dest_pg, dest_options) - dest_pg.slow_start(replica = True) - dest_pg.stop() - - # make changes on master - source_pg.pgbench_init(scale = 10) - pgbench = source_pg.pgbench(options=['-T', '10', '--no-vacuum']) - pgbench.wait() - source_pg.safe_psql("postgres", "INSERT INTO ultimate_question VALUES(42)") - result = source_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") - - # catchup + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() + + # preparation 3: make changes on master (source) + # perform checkpoint twice to ensure, that datafile is actually deleted on filesystem + src_pg.safe_psql("postgres", "DROP TABLE ultimate_question") + src_pg.safe_psql("postgres", "CHECKPOINT") + src_pg.safe_psql("postgres", "CHECKPOINT") + + # do delta catchup self.catchup_node( backup_mode = 'DELTA', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream'] + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] ) - source_pgdata = self.pgdata_content(source_pg.data_dir) - dest_pgdata = self.pgdata_content(dest_pg.data_dir) - self.compare_pgdata(source_pgdata, dest_pgdata) - - # stop replication - source_pg.stop() - - # check latest changes - self.set_replica(master = source_pg, replica = dest_pg) - dest_pg.slow_start(replica = True) - self.assertEqual( - result, - dest_pg.safe_psql("postgres", "SELECT * FROM ultimate_question"), - 'Different answer from copy') - dest_pg.stop() + # Check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) - # Clean after yourself - self.del_test_dir(module_name, fname) + # Cleanup + src_pg.stop() + 
self.del_test_dir(module_name, self.fname) - # @unittest.skip("skip") - def test_table_drop(self): + def test_table_drop_with_ptrack(self): """ + Test that dropped table in source will be dropped in ptrack catchup'ed instance too """ if not self.ptrack: return unittest.skip('Skipped because ptrack support is disabled') - fname = self.id().split('.')[3] - - source_pg = self.make_simple_node( - base_dir = os.path.join(module_name, fname, 'src'), + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, ptrack_enable = True, - initdb_params = ['--data-checksums']) - source_pg.slow_start() - - source_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") - source_pg.safe_psql( + initdb_params = ['--data-checksums'] + ) + src_pg.slow_start() + src_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") + src_pg.safe_psql( "postgres", "CREATE TABLE ultimate_question AS SELECT 42 AS answer") - dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + # preparation 2: make clean shutdowned lagging behind replica + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = [ - '-d', 'postgres', - '-p', str(source_pg.port), - '--stream' - ] + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() + + # preparation 3: make changes on master (source) + # perform checkpoint twice to ensure, that datafile is actually deleted on filesystem + src_pg.safe_psql("postgres", "DROP TABLE ultimate_question") + src_pg.safe_psql("postgres", "CHECKPOINT") + src_pg.safe_psql("postgres", "CHECKPOINT") + + # do ptrack catchup + self.catchup_node( + 
backup_mode = 'PTRACK', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] ) + # Check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) + + # Cleanup + src_pg.stop() + self.del_test_dir(module_name, self.fname) + + def test_tablefile_truncation_with_delta(self): + """ + Test that truncated table in source will be truncated in delta catchup'ed instance too + """ + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, + pg_options = { 'wal_log_hints': 'on' } + ) + src_pg.slow_start() + src_pg.safe_psql( + "postgres", + "CREATE SEQUENCE t_seq; " + "CREATE TABLE t_heap AS SELECT i AS id, " + "md5(i::text) AS text, " + "md5(repeat(i::text, 10))::tsvector AS tsvector " + "FROM generate_series(0, 1024) i") + src_pg.safe_psql("postgres", "VACUUM t_heap") + + # preparation 2: make clean shutdowned lagging behind replica + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) dest_options = {} - dest_options['port'] = str(dest_pg.port) - self.set_auto_conf(dest_pg, dest_options) - dest_pg.slow_start() - dest_pg.stop() + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() - source_pg.safe_psql("postgres", "DROP TABLE ultimate_question") - source_pg.safe_psql("postgres", "CHECKPOINT") - source_pg.safe_psql("postgres", "CHECKPOINT") + # preparation 3: make changes on master (source) + src_pg.safe_psql("postgres", "DELETE FROM t_heap WHERE ctid >= '(11,0)'") + src_pg.safe_psql("postgres", "VACUUM t_heap") - # catchup + # do delta catchup 
self.catchup_node( - backup_mode = 'PTRACK', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) - source_pgdata = self.pgdata_content(source_pg.data_dir) - dest_pgdata = self.pgdata_content(dest_pg.data_dir) - self.compare_pgdata(source_pgdata, dest_pgdata) + # Check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) - # Clean after yourself - source_pg.stop() - self.del_test_dir(module_name, fname) + # Cleanup + src_pg.stop() + self.del_test_dir(module_name, self.fname) - # @unittest.skip("skip") - def test_tablefile_truncation(self): + def test_tablefile_truncation_with_ptrack(self): """ + Test that truncated table in source will be truncated in ptrack catchup'ed instance too """ if not self.ptrack: return unittest.skip('Skipped because ptrack support is disabled') - fname = self.id().split('.')[3] - - source_pg = self.make_simple_node( - base_dir = os.path.join(module_name, fname, 'src'), + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, ptrack_enable = True, - initdb_params = ['--data-checksums']) - source_pg.slow_start() - - source_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") - source_pg.safe_psql( + initdb_params = ['--data-checksums'] + ) + src_pg.slow_start() + src_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") + src_pg.safe_psql( "postgres", "CREATE SEQUENCE t_seq; " "CREATE TABLE t_heap AS SELECT i AS id, " "md5(i::text) AS text, " "md5(repeat(i::text, 10))::tsvector AS tsvector " "FROM generate_series(0, 1024) i") - source_pg.safe_psql("postgres", "VACUUM t_heap") + src_pg.safe_psql("postgres", "VACUUM t_heap") - dest_pg = 
self.make_empty_node(os.path.join(module_name, fname, 'dst')) + # preparation 2: make clean shutdowned lagging behind replica + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = [ - '-d', 'postgres', - '-p', str(source_pg.port), - '--stream' - ] + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] ) - dest_options = {} - dest_options['port'] = str(dest_pg.port) - self.set_auto_conf(dest_pg, dest_options) - dest_pg.slow_start() - dest_pg.stop() + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() - source_pg.safe_psql("postgres", "DELETE FROM t_heap WHERE ctid >= '(11,0)'") - source_pg.safe_psql("postgres", "VACUUM t_heap") + # preparation 3: make changes on master (source) + src_pg.safe_psql("postgres", "DELETE FROM t_heap WHERE ctid >= '(11,0)'") + src_pg.safe_psql("postgres", "VACUUM t_heap") - # catchup + # do ptrack catchup self.catchup_node( backup_mode = 'PTRACK', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, - options = ['-d', 'postgres', '-p', str(source_pg.port), '--stream']) + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) - source_pgdata = self.pgdata_content(source_pg.data_dir) - dest_pgdata = self.pgdata_content(dest_pg.data_dir) - self.compare_pgdata(source_pgdata, dest_pgdata) + # Check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) - # Clean after yourself - source_pg.stop() - self.del_test_dir(module_name, fname) + # Cleanup + src_pg.stop() + self.del_test_dir(module_name, self.fname) - # @unittest.skip("skip") +######################################### 
+# Test reaction on user errors +######################################### def test_local_tablespace_without_mapping(self): if self.remote: return unittest.skip('Skipped because this test tests local catchup error handling') - fname = self.id().split('.')[3] - - source_pg = self.make_simple_node( - base_dir = os.path.join(module_name, fname, 'src'), - initdb_params = ['--data-checksums']) - source_pg.slow_start() + src_pg = self.make_simple_node(base_dir = os.path.join(module_name, self.fname, 'src')) + src_pg.slow_start() - tblspace_path = self.get_tblspace_path(source_pg, 'tblspace') + tblspace_path = self.get_tblspace_path(src_pg, 'tblspace') self.create_tblspace_in_node( - source_pg, 'tblspace', + src_pg, 'tblspace', tblspc_path = tblspace_path) - source_pg.safe_psql( + src_pg.safe_psql( "postgres", "CREATE TABLE ultimate_question TABLESPACE tblspace AS SELECT 42 AS answer") - dest_pg = self.make_empty_node(os.path.join(module_name, fname, 'dst')) + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) try: self.catchup_node( backup_mode = 'FULL', - source_pgdata = source_pg.data_dir, - destination_node = dest_pg, + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, options = [ '-d', 'postgres', - '-p', str(source_pg.port), + '-p', str(src_pg.port), '--stream', ] ) @@ -430,6 +525,6 @@ def test_local_tablespace_without_mapping(self): e.message, '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) - source_pg.stop() + src_pg.stop() # Clean after yourself - self.del_test_dir(module_name, fname) + self.del_test_dir(module_name, self.fname) From 629b7324a917377529666d8e515d31fa1fdeab89 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Thu, 10 Jun 2021 17:27:07 +0300 Subject: [PATCH 55/63] more checks and tests --- src/catchup.c | 31 +++++--- src/pg_probackup.h | 2 + src/util.c | 18 ++++- tests/catchup.py | 173 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 211 insertions(+), 13 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index fc004506e..cab4a2899 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -174,6 +174,27 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, } } + /* check backup_label absence in dest */ + if (current.backup_mode != BACKUP_MODE_FULL) + { + char backup_label_filename[MAXPGPATH]; + + join_path_components(backup_label_filename, dest_pgdata, PG_BACKUP_LABEL_FILE); + if (fio_access(backup_label_filename, F_OK, FIO_LOCAL_HOST) == 0) + elog(ERROR, "Destination directory contains \"" PG_BACKUP_LABEL_FILE "\" file"); + } + + /* check that destination database is shutdowned cleanly */ + if (current.backup_mode != BACKUP_MODE_FULL) + { + DBState state; + state = get_system_dbstate(dest_pgdata, FIO_LOCAL_HOST); + /* see states in postgres sources (src/include/catalog/pg_control.h) */ + if (state != DB_SHUTDOWNED && state != DB_SHUTDOWNED_IN_RECOVERY) + elog(ERROR, "Postmaster in destination directory \"%s\" must be stopped cleanly", + dest_pgdata); + } + /* Check that connected PG instance, source and destination PGDATA are the same */ { uint64 source_conn_id, source_id, dest_id; @@ -206,16 +227,6 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, elog(ERROR, "Ptrack is disabled"); } - /* check backup_label absence in dest */ - if (current.backup_mode != BACKUP_MODE_FULL) - { - char backup_label_filename[MAXPGPATH]; - - join_path_components(backup_label_filename, dest_pgdata, PG_BACKUP_LABEL_FILE); - if (fio_access(backup_label_filename, F_OK, FIO_LOCAL_HOST) == 0) - elog(ERROR, "Destination directory contains \"" PG_BACKUP_LABEL_FILE "\" file"); - } - if (current.from_replica && 
exclusive_backup) elog(ERROR, "Catchup from standby is only available for PostgreSQL >= 9.6"); diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 01559a52f..9dca8abe5 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -17,6 +17,7 @@ #include "access/xlog_internal.h" #include "utils/pg_crc.h" +#include "catalog/pg_control.h" #if PG_VERSION_NUM >= 120000 #include "common/logging.h" @@ -1176,6 +1177,7 @@ extern uint64 get_system_identifier(const char *pgdata_path, fio_location locati extern uint64 get_remote_system_identifier(PGconn *conn); extern uint32 get_data_checksum_version(bool safe); extern pg_crc32c get_pgcontrol_checksum(const char *pgdata_path); +extern DBState get_system_dbstate(const char *pgdata_path, fio_location location); extern uint32 get_xlog_seg_size(const char *pgdata_path); extern void get_redo(const char *pgdata_path, fio_location pgdata_location, RedoParams *redo); extern void set_min_recovery_point(pgFile *file, const char *backup_path, diff --git a/src/util.c b/src/util.c index 094e5b3dd..84e7a9eb1 100644 --- a/src/util.c +++ b/src/util.c @@ -10,8 +10,6 @@ #include "pg_probackup.h" -#include "catalog/pg_control.h" - #include #include @@ -351,6 +349,22 @@ get_pgcontrol_checksum(const char *pgdata_path) return ControlFile.crc; } +DBState +get_system_dbstate(const char *pgdata_path, fio_location location) +{ + ControlFileData ControlFile; + char *buffer; + size_t size; + + buffer = slurpFile(pgdata_path, XLOG_CONTROL_FILE, &size, false, location); + if (buffer == NULL) + return 0; + digestControlFile(&ControlFile, buffer, size); + pg_free(buffer); + + return ControlFile.state; +} + void get_redo(const char *pgdata_path, fio_location pgdata_location, RedoParams *redo) { diff --git a/tests/catchup.py b/tests/catchup.py index 0332bcb6e..60af6908d 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -1,4 +1,5 @@ import os +import signal import unittest from .helpers.ptrack_helpers import ProbackupTest, ProbackupException @@ 
-525,6 +526,176 @@ def test_local_tablespace_without_mapping(self): e.message, '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + # Cleanup + src_pg.stop() + self.del_test_dir(module_name, self.fname) + + def test_running_dest_postmaster(self): + """ + Test that we detect running postmaster in destination + """ + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, + pg_options = { 'wal_log_hints': 'on' } + ) + src_pg.slow_start() + + # preparation 2: destination + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + # leave running destination postmaster + #dst_pg.stop() + + # try delta catchup + try: + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + self.assertEqual(1, 0, "Expecting Error because postmaster in destination is running.\n Output: {0} \n CMD: {1}".format( + repr(self.output), self.cmd)) + except ProbackupException as e: + self.assertIn( + 'ERROR: Postmaster with pid ', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + # Cleanup + src_pg.stop() + self.del_test_dir(module_name, self.fname) + + def test_same_db_id(self): + """ + Test that we detect different id's of source and destination + """ + # preparation: + # source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True + ) + src_pg.slow_start() + # destination + dst_pg = 
self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + # fake destination + fake_dst_pg = self.make_simple_node(base_dir = os.path.join(module_name, self.fname, 'fake_dst')) + # fake source + fake_src_pg = self.make_simple_node(base_dir = os.path.join(module_name, self.fname, 'fake_src')) + + # try delta catchup (src (with correct src conn), fake_dst) + try: + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = fake_dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + self.assertEqual(1, 0, "Expecting Error because database identifiers mismatch.\n Output: {0} \n CMD: {1}".format( + repr(self.output), self.cmd)) + except ProbackupException as e: + self.assertIn( + 'ERROR: Database identifiers mismatch: ', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + # try delta catchup (fake_src (with wrong src conn), dst) + try: + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = fake_src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + self.assertEqual(1, 0, "Expecting Error because database identifiers mismatch.\n Output: {0} \n CMD: {1}".format( + repr(self.output), self.cmd)) + except ProbackupException as e: + self.assertIn( + 'ERROR: Database identifiers mismatch: ', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + # Cleanup + src_pg.stop() + self.del_test_dir(module_name, self.fname) + + def test_destination_dbstate(self): + """ + Test that we detect that destination pg is not cleanly shutdowned + """ + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + 
set_replication = True, + pg_options = { 'wal_log_hints': 'on' } + ) + src_pg.slow_start() + + # preparation 2: destination + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + + # try #1 + try: + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + self.assertEqual(1, 0, "Expecting Error because destination pg is not cleanly shutdowned.\n Output: {0} \n CMD: {1}".format( + repr(self.output), self.cmd)) + except ProbackupException as e: + self.assertIn( + 'ERROR: Destination directory contains "backup_label" file', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + # try #2 + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + self.assertNotEqual(dst_pg.pid, 0, "Cannot detect pid of running postgres") + os.kill(dst_pg.pid, signal.SIGKILL) + try: + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + self.assertEqual(1, 0, "Expecting Error because destination pg is not cleanly shutdowned.\n Output: {0} \n CMD: {1}".format( + repr(self.output), self.cmd)) + except ProbackupException as e: + self.assertIn( + 'must be stopped cleanly', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + # Cleanup src_pg.stop() - # Clean after yourself self.del_test_dir(module_name, self.fname) From b15fcf615c9733c8f582bd76eb6a699f597043d3 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Thu, 10 Jun 2021 18:02:41 +0300 Subject: [PATCH 56/63] refine test_same_db_id test --- tests/catchup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/catchup.py b/tests/catchup.py index 60af6908d..a36b4f3a6 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -596,6 +596,11 @@ def test_same_db_id(self): destination_node = dst_pg, options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() # fake destination fake_dst_pg = self.make_simple_node(base_dir = os.path.join(module_name, self.fname, 'fake_dst')) # fake source From ba22e2effb91d9c441e46f8196e951cc865fbfba Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Thu, 10 Jun 2021 20:11:36 +0300 Subject: [PATCH 57/63] split do_catchup_instance() into parts #1 --- src/catchup.c | 540 ++++++++++++++++++++++++--------------------- src/pg_probackup.c | 2 +- src/pg_probackup.h | 23 +- 3 files changed, 286 insertions(+), 279 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index cab4a2899..c3d3c289d 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -31,32 +31,6 @@ static void catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *sourc const char *dest_pgdata); static void catchup_check_tablespaces_existance_in_tbsmapping(PGconn *conn); static parray* catchup_get_tli_history(ConnectionOptions *conn_opt, TimeLineID tli); -static void do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, - PGNodeInfo *nodeInfo, bool no_sync, bool backup_logs); -static void *catchup_thread_runner(void *arg); - -/* - * Entry point of pg_probackup CATCHUP subcommand. 
- */ -int -do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads) -{ - PGconn *source_conn = NULL; - PGNodeInfo source_node_info; - bool no_sync = false; - bool backup_logs = false; - - source_conn = catchup_collect_info(&source_node_info, source_pgdata, dest_pgdata); - catchup_preflight_checks(&source_node_info, source_conn, source_pgdata, dest_pgdata); - - do_catchup_instance(source_pgdata, dest_pgdata, source_conn, &source_node_info, - no_sync, backup_logs); - - //REVIEW: Are we going to do that before release? - /* TODO: show the amount of transfered data in bytes and calculate incremental ratio */ - - return 0; -} //REVIEW The name of this function looks strange to me. //Maybe catchup_init_state() or catchup_setup() will do better? @@ -367,58 +341,264 @@ catchup_get_tli_history(ConnectionOptions *conn_opt, TimeLineID tli) return result; } - /* - * TODO: - * - add description - * main worker function, to be moved into do_catchup() and then to be split into meaningful pieces + * catchup multithreaded copy rountine and helper structure and function */ -static void -do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn *source_conn, - PGNodeInfo *source_node_info, bool no_sync, bool backup_logs) + +/* parameters for catchup_thread_runner() passed from catchup_multithreaded_copy() */ +typedef struct +{ + PGNodeInfo *nodeInfo; + const char *from_root; + const char *to_root; + parray *source_filelist; + parray *dest_filelist; + XLogRecPtr sync_lsn; + BackupMode backup_mode; + int thread_num; + bool completed; +} catchup_thread_runner_arg; + +/* Catchup file copier executed in separate thread */ +static void * +catchup_thread_runner(void *arg) { int i; - char dest_xlog_path[MAXPGPATH]; - char label[1024]; - RedoParams dest_redo = { 0, InvalidXLogRecPtr, 0 }; - pgFile *source_pg_control_file = NULL; + char from_fullpath[MAXPGPATH]; + char to_fullpath[MAXPGPATH]; + + catchup_thread_runner_arg *arguments = 
(catchup_thread_runner_arg *) arg; + int n_files = parray_num(arguments->source_filelist); + + /* catchup a file */ + for (i = 0; i < n_files; i++) + { + pgFile *file = (pgFile *) parray_get(arguments->source_filelist, i); + pgFile *dest_file = NULL; + + /* We have already copied all directories */ + if (S_ISDIR(file->mode)) + continue; + + if (!pg_atomic_test_set_flag(&file->lock)) + continue; + + /* check for interrupt */ + if (interrupted || thread_interrupted) + elog(ERROR, "Interrupted during catchup"); + + if (progress) + elog(INFO, "Progress: (%d/%d). Process file \"%s\"", + i + 1, n_files, file->rel_path); + + /* construct destination filepath */ + Assert(file->external_dir_num == 0); + join_path_components(from_fullpath, arguments->from_root, file->rel_path); + join_path_components(to_fullpath, arguments->to_root, file->rel_path); + + /* Encountered some strange beast */ + if (!S_ISREG(file->mode)) + elog(WARNING, "Unexpected type %d of file \"%s\", skipping", + file->mode, from_fullpath); + + /* Check that file exist in dest pgdata */ + if (arguments->backup_mode != BACKUP_MODE_FULL) + { + pgFile **dest_file_tmp = NULL; + dest_file_tmp = (pgFile **) parray_bsearch(arguments->dest_filelist, + file, pgFileCompareRelPathWithExternal); + if (dest_file_tmp) + { + /* File exists in destination PGDATA */ + file->exists_in_prev = true; + dest_file = *dest_file_tmp; + } + } + + /* Do actual work */ + if (file->is_datafile && !file->is_cfs) + { + catchup_data_file(file, from_fullpath, to_fullpath, + arguments->sync_lsn, + arguments->backup_mode, + NONE_COMPRESS, + 0, + arguments->nodeInfo->checksum_version, + arguments->nodeInfo->ptrack_version_num, + arguments->nodeInfo->ptrack_schema, + false, + dest_file != NULL ? 
dest_file->size : 0); + } + else + { + backup_non_data_file(file, dest_file, from_fullpath, to_fullpath, + arguments->backup_mode, current.parent_backup, true); + } + if (file->write_size == FILE_NOT_FOUND) + continue; + + if (file->write_size == BYTES_INVALID) + { + elog(VERBOSE, "Skipping the unchanged file: \"%s\", read %li bytes", from_fullpath, file->read_size); + continue; + } + + elog(VERBOSE, "File \"%s\". Copied "INT64_FORMAT " bytes", + from_fullpath, file->write_size); + } + + /* ssh connection to longer needed */ + fio_disconnect(); + + /* Data files transferring is successful */ + arguments->completed = true; + + return NULL; +} + +/* + * main multithreaded copier + */ +static bool +catchup_multithreaded_copy(int num_threads, + PGNodeInfo *source_node_info, + const char *source_pgdata_path, + const char *dest_pgdata_path, + parray *source_filelist, + parray *dest_filelist, + XLogRecPtr sync_lsn, + BackupMode backup_mode) +{ /* arrays with meta info for multi threaded catchup */ - pthread_t *threads; catchup_thread_runner_arg *threads_args; + pthread_t *threads; + + bool all_threads_successful = true; + int i; + + /* init thread args */ + threads_args = (catchup_thread_runner_arg *) palloc(sizeof(catchup_thread_runner_arg) * num_threads); + for (i = 0; i < num_threads; i++) + threads_args[i] = (catchup_thread_runner_arg){ + .nodeInfo = source_node_info, + .from_root = source_pgdata_path, + .to_root = dest_pgdata_path, + .source_filelist = source_filelist, + .dest_filelist = dest_filelist, + .sync_lsn = sync_lsn, + .backup_mode = backup_mode, + .thread_num = i + 1, + .completed = false, + }; + + /* Run threads */ + thread_interrupted = false; + threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); + for (i = 0; i < num_threads; i++) + { + elog(VERBOSE, "Start thread num: %i", i); + pthread_create(&threads[i], NULL, &catchup_thread_runner, &(threads_args[i])); + } + + /* Wait threads */ + for (i = 0; i < num_threads; i++) + { + 
pthread_join(threads[i], NULL); + all_threads_successful &= threads_args[i].completed; + } + + free(threads); + free(threads_args); + return all_threads_successful; +} + +/* + * + */ +static void +catchup_sync_destination_files(const char* pgdata_path, fio_location location, parray *filelist, pgFile *pg_control_file) +{ + char fullpath[MAXPGPATH]; + time_t start_time, end_time; + char pretty_time[20]; + int i; + + elog(INFO, "Syncing copied files to disk"); + time(&start_time); + + for (i = 0; i < parray_num(filelist); i++) + { + pgFile *file = (pgFile *) parray_get(filelist, i); + + /* TODO: sync directory ? */ + if (S_ISDIR(file->mode)) + continue; + + Assert(file->external_dir_num == 0); + join_path_components(fullpath, pgdata_path, file->rel_path); + if (fio_sync(fullpath, location) != 0) + elog(ERROR, "Cannot sync file \"%s\": %s", fullpath, strerror(errno)); + } + + /* + * sync pg_control file + */ + join_path_components(fullpath, pgdata_path, pg_control_file->rel_path); + if (fio_sync(fullpath, location) != 0) + elog(ERROR, "Cannot sync file \"%s\": %s", fullpath, strerror(errno)); + + time(&end_time); + pretty_time_interval(difftime(end_time, start_time), + pretty_time, lengthof(pretty_time)); + elog(INFO, "Files are synced, time elapsed: %s", pretty_time); +} + +/* + * Entry point of pg_probackup CATCHUP subcommand. 
+ */ +int +do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads, bool sync_dest_files) +{ + PGconn *source_conn = NULL; + PGNodeInfo source_node_info; + bool backup_logs = false; + parray *source_filelist = NULL; + pgFile *source_pg_control_file = NULL; + parray *dest_filelist = NULL; + char dest_xlog_path[MAXPGPATH]; + + RedoParams dest_redo = { 0, InvalidXLogRecPtr, 0 }; + PGStopBackupResult stop_backup_result; bool catchup_isok = true; - parray *source_filelist = NULL; - parray *dest_filelist = NULL; + int i; /* for fancy reporting */ time_t start_time, end_time; char pretty_time[20]; char pretty_bytes[20]; - PGStopBackupResult stop_backup_result; - //REVIEW Is it relevant to catchup? I suppose it isn't, since catchup is a new code. - //If we do need it, please write a comment explaining that. - /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ - int timeout = (instance_config.archive_timeout > 0) ? - instance_config.archive_timeout : ARCHIVE_TIMEOUT_DEFAULT; - char *query_text = NULL; + source_conn = catchup_collect_info(&source_node_info, source_pgdata, dest_pgdata); + catchup_preflight_checks(&source_node_info, source_conn, source_pgdata, dest_pgdata); elog(LOG, "Database catchup start"); - /* notify start of backup to PostgreSQL server */ - time2iso(label, lengthof(label), current.start_time, false); - strncat(label, " with pg_probackup", lengthof(label) - - strlen(" with pg_probackup")); - - /* Call pg_start_backup function in PostgreSQL connect */ - pg_start_backup(label, smooth_checkpoint, &current, source_node_info, source_conn); - elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn)); + { + char label[1024]; + /* notify start of backup to PostgreSQL server */ + time2iso(label, lengthof(label), current.start_time, false); + strncat(label, " with pg_probackup", lengthof(label) - + strlen(" with pg_probackup")); + + /* Call pg_start_backup function in 
PostgreSQL connect */ + pg_start_backup(label, smooth_checkpoint, &current, &source_node_info, source_conn); + elog(LOG, "pg_start_backup START LSN %X/%X", (uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn)); + } //REVIEW I wonder, if we can move this piece above and call before pg_start backup()? //It seems to be a part of setup phase. - if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || - current.backup_mode == BACKUP_MODE_DIFF_DELTA) + if (current.backup_mode != BACKUP_MODE_FULL) { dest_filelist = parray_new(); dir_list_file(dest_filelist, dest_pgdata, @@ -436,7 +616,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * */ if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK) { - XLogRecPtr ptrack_lsn = get_last_ptrack_lsn(source_conn, source_node_info); + XLogRecPtr ptrack_lsn = get_last_ptrack_lsn(source_conn, &source_node_info); // new ptrack is more robust and checks Start LSN if (ptrack_lsn > dest_redo.lsn || ptrack_lsn == InvalidXLogRecPtr) @@ -515,8 +695,8 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * /* Build the page map from ptrack information */ make_pagemap_from_ptrack_2(source_filelist, source_conn, - source_node_info->ptrack_schema, - source_node_info->ptrack_version_num, + source_node_info.ptrack_schema, + source_node_info.ptrack_version_num, dest_redo.lsn); time(&end_time); elog(INFO, "Pagemap successfully extracted, time elapsed: %.0f sec", @@ -622,8 +802,7 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * * remove absent source files in dest (dropped tables, etc...) 
* note: global/pg_control will also be deleted here */ - if (current.backup_mode == BACKUP_MODE_DIFF_PTRACK || - current.backup_mode == BACKUP_MODE_DIFF_DELTA) + if (current.backup_mode != BACKUP_MODE_FULL) { elog(INFO, "Removing redundant files in destination directory"); parray_qsort(dest_filelist, pgFileCompareRelPathWithExternalDesc); @@ -675,45 +854,13 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * if (dest_filelist) parray_qsort(dest_filelist, pgFileCompareRelPathWithExternal); - /* init thread args */ - threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads); - threads_args = (catchup_thread_runner_arg *) palloc(sizeof(catchup_thread_runner_arg) * num_threads); - - for (i = 0; i < num_threads; i++) - { - catchup_thread_runner_arg *arg = &(threads_args[i]); - - arg->nodeInfo = source_node_info; - arg->from_root = source_pgdata; - arg->to_root = dest_pgdata; - arg->source_filelist = source_filelist; - arg->dest_filelist = dest_filelist; - arg->sync_lsn = dest_redo.lsn; - arg->backup_mode = current.backup_mode; - arg->thread_num = i + 1; - /* By default there are some error */ - arg->ret = 1; - } - - /* Run threads */ - thread_interrupted = false; + /* run copy threads */ elog(INFO, "Start transferring data files"); time(&start_time); - for (i = 0; i < num_threads; i++) - { - catchup_thread_runner_arg *arg = &(threads_args[i]); - - elog(VERBOSE, "Start thread num: %i", i); - pthread_create(&threads[i], NULL, catchup_thread_runner, arg); - } - - /* Wait threads */ - for (i = 0; i < num_threads; i++) - { - pthread_join(threads[i], NULL); - if (threads_args[i].ret == 1) - catchup_isok = false; - } + catchup_isok = catchup_multithreaded_copy(num_threads, &source_node_info, + source_pgdata, dest_pgdata, + source_filelist, dest_filelist, + dest_redo.lsn, current.backup_mode); /* at last copy control file */ if (catchup_isok) @@ -737,27 +884,39 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, 
PGconn * pretty_time); /* Notify end of backup */ - pg_silent_client_messages(source_conn); + { + //REVIEW Is it relevant to catchup? I suppose it isn't, since catchup is a new code. + //If we do need it, please write a comment explaining that. + /* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */ + int timeout = (instance_config.archive_timeout > 0) ? + instance_config.archive_timeout : ARCHIVE_TIMEOUT_DEFAULT; + char *stop_backup_query_text = NULL; + + pg_silent_client_messages(source_conn); + + //REVIEW. Do we want to support pg 9.5? I suppose we never test it... + //Maybe check it and error out early? + /* Create restore point + * Only if backup is from master. + * For PG 9.5 create restore point only if pguser is superuser. + */ + if (!current.from_replica && + !(source_node_info.server_version < 90600 && + !source_node_info.is_superuser)) //TODO: check correctness + pg_create_restore_point(source_conn, current.start_time); - //REVIEW. Do we want to support pg 9.5? I suppose we never test it... - //Maybe check it and error out early? - /* Create restore point - * Only if backup is from master. - * For PG 9.5 create restore point only if pguser is superuser. 
- */ - if (!current.from_replica && - !(source_node_info->server_version < 90600 && - !source_node_info->is_superuser)) //TODO: check correctness - pg_create_restore_point(source_conn, current.start_time); + /* Execute pg_stop_backup using PostgreSQL connection */ + pg_stop_backup_send(source_conn, source_node_info.server_version, current.from_replica, exclusive_backup, &stop_backup_query_text); - /* Execute pg_stop_backup using PostgreSQL connection */ - pg_stop_backup_send(source_conn, source_node_info->server_version, current.from_replica, exclusive_backup, &query_text); + /* + * Wait for the result of pg_stop_backup(), but no longer than + * archive_timeout seconds + */ + pg_stop_backup_consume(source_conn, source_node_info.server_version, exclusive_backup, timeout, stop_backup_query_text, &stop_backup_result); - /* - * Wait for the result of pg_stop_backup(), but no longer than - * archive_timeout seconds - */ - pg_stop_backup_consume(source_conn, source_node_info->server_version, exclusive_backup, timeout, query_text, &stop_backup_result); + /* Cleanup */ + pg_free(stop_backup_query_text); + } wait_wal_and_calculate_stop_lsn(dest_xlog_path, stop_backup_result.lsn, &current); @@ -804,9 +963,6 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * current.recovery_time = stop_backup_result.invocation_time; } - /* Cleanup */ - pg_free(query_text); - /* * In case of backup from replica >= 9.6 we must fix minRecPoint */ @@ -819,45 +975,12 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * fio_disconnect(); /* Sync all copied files unless '--no-sync' flag is used */ - if (no_sync) - elog(WARNING, "Files are not synced to disk"); - else + if (catchup_isok) { - char to_fullpath[MAXPGPATH]; - - elog(INFO, "Syncing copied files to disk"); - time(&start_time); - - for (i = 0; i < parray_num(source_filelist); i++) - { - pgFile *file = (pgFile *) parray_get(source_filelist, i); - - /* TODO: sync directory ? 
*/ - if (S_ISDIR(file->mode)) - continue; - - if (file->write_size <= 0) - continue; - - /* construct fullpath */ - Assert(file->external_dir_num == 0); - join_path_components(to_fullpath, dest_pgdata, file->rel_path); - - if (fio_sync(to_fullpath, FIO_LOCAL_HOST) != 0) - elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); - } - - /* - * sync pg_control file - */ - join_path_components(to_fullpath, dest_pgdata, source_pg_control_file->rel_path); - if (fio_sync(to_fullpath, FIO_LOCAL_HOST) != 0) - elog(ERROR, "Cannot sync file \"%s\": %s", to_fullpath, strerror(errno)); - - time(&end_time); - pretty_time_interval(difftime(end_time, start_time), - pretty_time, lengthof(pretty_time)); - elog(INFO, "Files are synced, time elapsed: %s", pretty_time); + if (sync_dest_files) + catchup_sync_destination_files(dest_pgdata, FIO_LOCAL_HOST, source_filelist, source_pg_control_file); + else + elog(WARNING, "Files are not synced to disk"); } /* Cleanup */ @@ -869,104 +992,9 @@ do_catchup_instance(const char *source_pgdata, const char *dest_pgdata, PGconn * parray_walk(source_filelist, pgFileFree); parray_free(source_filelist); pgFileFree(source_pg_control_file); -} -/* - * Catchup file copier executed in separate threads - */ -static void * -catchup_thread_runner(void *arg) -{ - int i; - char from_fullpath[MAXPGPATH]; - char to_fullpath[MAXPGPATH]; - - catchup_thread_runner_arg *arguments = (catchup_thread_runner_arg *) arg; - int n_files = parray_num(arguments->source_filelist); - - /* catchup a file */ - for (i = 0; i < n_files; i++) - { - pgFile *file = (pgFile *) parray_get(arguments->source_filelist, i); - pgFile *dest_file = NULL; - - /* We have already copied all directories */ - if (S_ISDIR(file->mode)) - continue; - - if (!pg_atomic_test_set_flag(&file->lock)) - continue; - - /* check for interrupt */ - if (interrupted || thread_interrupted) - elog(ERROR, "Interrupted during catchup"); - - if (progress) - elog(INFO, "Progress: (%d/%d). 
Process file \"%s\"", - i + 1, n_files, file->rel_path); - - /* construct destination filepath */ - Assert(file->external_dir_num == 0); - join_path_components(from_fullpath, arguments->from_root, file->rel_path); - join_path_components(to_fullpath, arguments->to_root, file->rel_path); - - /* Encountered some strange beast */ - if (!S_ISREG(file->mode)) - elog(WARNING, "Unexpected type %d of file \"%s\", skipping", - file->mode, from_fullpath); - - /* Check that file exist in dest pgdata */ - if (arguments->backup_mode != BACKUP_MODE_FULL) - { - pgFile **dest_file_tmp = NULL; - dest_file_tmp = (pgFile **) parray_bsearch(arguments->dest_filelist, - file, pgFileCompareRelPathWithExternal); - if (dest_file_tmp) - { - /* File exists in destination PGDATA */ - file->exists_in_prev = true; - dest_file = *dest_file_tmp; - } - } - - /* Do actual work */ - if (file->is_datafile && !file->is_cfs) - { - catchup_data_file(file, from_fullpath, to_fullpath, - arguments->sync_lsn, - arguments->backup_mode, - NONE_COMPRESS, - 0, - arguments->nodeInfo->checksum_version, - arguments->nodeInfo->ptrack_version_num, - arguments->nodeInfo->ptrack_schema, - false, - dest_file != NULL ? dest_file->size : 0); - } - else - { - backup_non_data_file(file, dest_file, from_fullpath, to_fullpath, - arguments->backup_mode, current.parent_backup, true); - } - - if (file->write_size == FILE_NOT_FOUND) - continue; - - if (file->write_size == BYTES_INVALID) - { - elog(VERBOSE, "Skipping the unchanged file: \"%s\", read %li bytes", from_fullpath, file->read_size); - continue; - } - - elog(VERBOSE, "File \"%s\". Copied "INT64_FORMAT " bytes", - from_fullpath, file->write_size); - } - - /* ssh connection to longer needed */ - fio_disconnect(); - - /* Data files transferring is successful */ - arguments->ret = 0; + //REVIEW: Are we going to do that before release? 
+ /* TODO: show the amount of transferred data in bytes and calculate incremental ratio */ - return NULL; + return 0; } diff --git a/src/pg_probackup.c b/src/pg_probackup.c index 1ae8fa660..00796be04 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -825,7 +825,7 @@ main(int argc, char *argv[]) no_validate, no_sync, backup_logs); } case CATCHUP_CMD: - return do_catchup(catchup_source_pgdata, catchup_destination_pgdata, num_threads); + return do_catchup(catchup_source_pgdata, catchup_destination_pgdata, num_threads, !no_sync); case RESTORE_CMD: return do_restore_or_validate(instanceState, current.backup_id, recovery_target_options, diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 9dca8abe5..1b489a9c3 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -601,27 +601,6 @@ typedef struct int ret; } backup_files_arg; -typedef struct -{ - PGNodeInfo *nodeInfo; - - const char *from_root; - const char *to_root; - - parray *source_filelist; - parray *dest_filelist; - - XLogRecPtr sync_lsn; - BackupMode backup_mode; - int thread_num; - - /* - * Return value from the thread. - * 0 means there is no error, 1 - there is an error. - */ - int ret; -} catchup_thread_runner_arg; - typedef struct timelineInfo timelineInfo; /* struct to collect info about timelines in WAL archive */ @@ -869,7 +848,7 @@ extern char *pg_ptrack_get_block(ConnectionArgs *arguments, BlockNumber blknum, size_t *result_size, int ptrack_version_num, const char *ptrack_schema); /* in catchup.c */ -extern int do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads); +extern int do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads, bool sync_dest_files); /* in restore.c */ extern int do_restore_or_validate(InstanceState *instanceState, time_t target_backup_id, From 8d3fb162ae2f689f86b3aa11f51b98df816823e1 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Fri, 11 Jun 2021 01:11:47 +0300 Subject: [PATCH 58/63] TLI tests and TLI check fix --- src/catchup.c | 26 +++-- src/pg_probackup.h | 2 +- src/restore.c | 8 +- tests/catchup.py | 281 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 299 insertions(+), 18 deletions(-) diff --git a/src/catchup.c b/src/catchup.c index c3d3c289d..5adcd4f67 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -13,6 +13,7 @@ #include "catalog/catalog.h" #endif #include "catalog/pg_tablespace.h" +#include "access/timeline.h" #include "pgtar.h" #include "streamutil.h" @@ -210,21 +211,30 @@ catchup_preflight_checks(PGNodeInfo *source_node_info, PGconn *source_conn, /* check timelines */ if (current.backup_mode != BACKUP_MODE_FULL) { - TimeLineID dest_tli; - parray *source_timelines; + RedoParams dest_redo = { 0, InvalidXLogRecPtr, 0 }; - dest_tli = get_current_timeline_from_control(dest_pgdata, FIO_LOCAL_HOST, false); + /* fill dest_redo.lsn and dest_redo.tli */ + get_redo(dest_pgdata, FIO_LOCAL_HOST, &dest_redo); - source_timelines = catchup_get_tli_history(&instance_config.conn_opt, current.tli); + if (current.tli != 1) + { + parray *source_timelines; /* parray* of TimeLineHistoryEntry* */ + source_timelines = catchup_get_tli_history(&instance_config.conn_opt, current.tli); - if (source_timelines != NULL && !tliIsPartOfHistory(source_timelines, dest_tli)) - elog(ERROR, "Destination is not in source history"); + if (source_timelines == NULL) + elog(ERROR, "Cannot get source timeline history"); + + if (!satisfy_timeline(source_timelines, dest_redo.tli, dest_redo.lsn)) + elog(ERROR, "Destination is not in source timeline history"); - if (source_timelines != NULL) - { parray_walk(source_timelines, pfree); parray_free(source_timelines); } + else /* special case -- no history files in source */ + { + if (dest_redo.tli != 1) + elog(ERROR, "Source is behind destination in timeline history"); + } } } diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 
1b489a9c3..a15c0b52b 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -855,7 +855,7 @@ extern int do_restore_or_validate(InstanceState *instanceState, pgRecoveryTarget *rt, pgRestoreParams *params, bool no_sync); -extern bool satisfy_timeline(const parray *timelines, const pgBackup *backup); +extern bool satisfy_timeline(const parray *timelines, TimeLineID tli, XLogRecPtr lsn); extern bool satisfy_recovery_target(const pgBackup *backup, const pgRecoveryTarget *rt); extern pgRecoveryTarget *parseRecoveryTargetOptions( diff --git a/src/restore.c b/src/restore.c index e785e551b..3cefd0805 100644 --- a/src/restore.c +++ b/src/restore.c @@ -291,7 +291,7 @@ do_restore_or_validate(InstanceState *instanceState, time_t target_backup_id, pg if (!timelines) elog(ERROR, "Failed to get history file for target timeline %i", rt->target_tli); - if (!satisfy_timeline(timelines, current_backup)) + if (!satisfy_timeline(timelines, current_backup->tli, current_backup->stop_lsn)) { if (target_backup_id != INVALID_BACKUP_ID) elog(ERROR, "target backup %s does not satisfy target timeline", @@ -1818,7 +1818,7 @@ satisfy_recovery_target(const pgBackup *backup, const pgRecoveryTarget *rt) /* TODO description */ bool -satisfy_timeline(const parray *timelines, const pgBackup *backup) +satisfy_timeline(const parray *timelines, TimeLineID tli, XLogRecPtr lsn) { int i; @@ -1827,9 +1827,9 @@ satisfy_timeline(const parray *timelines, const pgBackup *backup) TimeLineHistoryEntry *timeline; timeline = (TimeLineHistoryEntry *) parray_get(timelines, i); - if (backup->tli == timeline->tli && + if (tli == timeline->tli && (XLogRecPtrIsInvalid(timeline->end) || - backup->stop_lsn <= timeline->end)) + lsn <= timeline->end)) return true; } return false; diff --git a/tests/catchup.py b/tests/catchup.py index a36b4f3a6..46e56a885 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -42,7 +42,7 @@ def test_simple_full_catchup(self): self.pgdata_content(dst_pg.data_dir) ) - # run&recover 
catchup'ed instance + # run&recover catchup'ed instance src_pg.stop() dst_options = {} dst_options['port'] = str(dst_pg.port) @@ -101,7 +101,7 @@ def test_full_catchup_with_tablespace(self): "UPDATE ultimate_question SET answer = -1") src_pg.stop() - # run&recover catchup'ed instance + # run&recover catchup'ed instance dst_options = {} dst_options['port'] = str(dst_pg.port) self.set_auto_conf(dst_pg, dst_options) @@ -166,7 +166,7 @@ def test_simple_delta_catchup(self): self.pgdata_content(dst_pg.data_dir) ) - # run&recover catchup'ed instance + # run&recover catchup'ed instance src_pg.stop() self.set_replica(master = src_pg, replica = dst_pg) dst_options = {} @@ -238,7 +238,7 @@ def test_simple_ptrack_catchup(self): self.pgdata_content(dst_pg.data_dir) ) - # run&recover catchup'ed instance + # run&recover catchup'ed instance src_pg.stop() self.set_replica(master = src_pg, replica = dst_pg) dst_options = {} @@ -254,6 +254,137 @@ def test_simple_ptrack_catchup(self): dst_pg.stop() self.del_test_dir(module_name, self.fname) + def test_tli_delta_catchup(self): + """ + Test that we correctly follow timeline change with delta catchup + """ + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, + pg_options = { 'wal_log_hints': 'on' } + ) + src_pg.slow_start() + + # preparation 2: destination + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() + + # preparation 3: promote source + src_pg.stop() + self.set_replica(dst_pg, src_pg) # fake replication + src_pg.slow_start(replica = True) + src_pg.promote() + src_pg.safe_psql("postgres", "CREATE 
TABLE ultimate_question AS SELECT 42 AS answer") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + # do catchup + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + + # 1st check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) + + # run&recover catchup'ed instance + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + + # 2nd check: run verification query + dst_query_result = dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') + + # Cleanup + src_pg.stop() + dst_pg.stop() + self.del_test_dir(module_name, self.fname) + + def test_tli_ptrack_catchup(self): + """ + Test that we correctly follow timeline change with ptrack catchup + """ + if not self.ptrack: + return unittest.skip('Skipped because ptrack support is disabled') + + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, + ptrack_enable = True, + initdb_params = ['--data-checksums'] + ) + src_pg.slow_start() + src_pg.safe_psql("postgres", "CREATE EXTENSION ptrack") + + # preparation 2: destination + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() + + # preparation 3: promote source + src_pg.stop() + self.set_replica(dst_pg, src_pg) # fake replication + 
src_pg.slow_start(replica = True) + src_pg.promote() + src_pg.safe_psql("postgres", "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + # do catchup + self.catchup_node( + backup_mode = 'PTRACK', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + + # 1st check: compare data directories + self.compare_pgdata( + self.pgdata_content(src_pg.data_dir), + self.pgdata_content(dst_pg.data_dir) + ) + + # run&recover catchup'ed instance + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + + # 2nd check: run verification query + dst_query_result = dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') + + # Cleanup + src_pg.stop() + dst_pg.stop() + self.del_test_dir(module_name, self.fname) + ######################################### # Test various corner conditions ######################################### @@ -491,6 +622,9 @@ def test_tablefile_truncation_with_ptrack(self): # Test reaction on user errors ######################################### def test_local_tablespace_without_mapping(self): + """ + Test that we detect absence of needed --tablespace-mapping option + """ if self.remote: return unittest.skip('Skipped because this test tests local catchup error handling') @@ -555,7 +689,7 @@ def test_running_dest_postmaster(self): self.set_auto_conf(dst_pg, dst_options) dst_pg.slow_start() # leave running destination postmaster - #dst_pg.stop() + # so don't call dst_pg.stop() # try delta catchup try: @@ -704,3 +838,140 @@ def test_destination_dbstate(self): # Cleanup src_pg.stop() self.del_test_dir(module_name, self.fname) + + def test_tli_destination_mismatch(self): + """ + Test that we detect TLI mismatch in destination + """ + # 
preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, + pg_options = { 'wal_log_hints': 'on' } + ) + src_pg.slow_start() + + # preparation 2: destination + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + self.set_replica(src_pg, dst_pg) + dst_pg.slow_start(replica = True) + dst_pg.promote() + dst_pg.stop() + + # preparation 3: "useful" changes + src_pg.safe_psql("postgres", "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + # try catchup + try: + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_query_result = dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + dst_pg.stop() + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') + except ProbackupException as e: + self.assertIn( + 'ERROR: Source is behind destination in timeline history', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + # Cleanup + src_pg.stop() + self.del_test_dir(module_name, self.fname) + + def test_tli_source_mismatch(self): + """ + Test that we detect TLI mismatch in source history + """ + # preparation 1: source + src_pg = self.make_simple_node( + base_dir = os.path.join(module_name, self.fname, 'src'), + set_replication = True, + pg_options = { 
'wal_log_hints': 'on' } + ) + src_pg.slow_start() + + # preparation 2: fake source (promoted copy) + fake_src_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'fake_src')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = fake_src_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + fake_src_options = {} + fake_src_options['port'] = str(fake_src_pg.port) + self.set_auto_conf(fake_src_pg, fake_src_options) + self.set_replica(src_pg, fake_src_pg) + fake_src_pg.slow_start(replica = True) + fake_src_pg.promote() + self.switch_wal_segment(fake_src_pg) + fake_src_pg.safe_psql( + "postgres", + "CREATE TABLE t_heap AS SELECT i AS id, " + "md5(i::text) AS text, " + "md5(repeat(i::text, 10))::tsvector AS tsvector " + "FROM generate_series(0, 256) i") + self.switch_wal_segment(fake_src_pg) + fake_src_pg.safe_psql("postgres", "CREATE TABLE ultimate_question AS SELECT 'trash' AS garbage") + + # preparation 3: destination + dst_pg = self.make_empty_node(os.path.join(module_name, self.fname, 'dst')) + self.catchup_node( + backup_mode = 'FULL', + source_pgdata = src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_pg.stop() + + # preparation 4: "useful" changes + src_pg.safe_psql("postgres", "CREATE TABLE ultimate_question AS SELECT 42 AS answer") + src_query_result = src_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + + # try catchup + try: + self.catchup_node( + backup_mode = 'DELTA', + source_pgdata = fake_src_pg.data_dir, + destination_node = dst_pg, + options = ['-d', 'postgres', '-p', str(fake_src_pg.port), '--stream'] + ) + dst_options = {} + dst_options['port'] = str(dst_pg.port) + self.set_auto_conf(dst_pg, dst_options) + dst_pg.slow_start() + dst_query_result = 
dst_pg.safe_psql("postgres", "SELECT * FROM ultimate_question") + dst_pg.stop() + self.assertEqual(src_query_result, dst_query_result, 'Different answer from copy') + except ProbackupException as e: + self.assertIn( + 'ERROR: Destination is not in source timeline history', + e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + # Cleanup + src_pg.stop() + fake_src_pg.stop() + self.del_test_dir(module_name, self.fname) From 6b6086f8ebdfb28987bf32285d3f0f451676f895 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Fri, 11 Jun 2021 04:14:20 +0300 Subject: [PATCH 59/63] postgres-9.5 compability fix --- src/catchup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/catchup.c b/src/catchup.c index 5adcd4f67..4318ffa24 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -930,6 +930,7 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads, wait_wal_and_calculate_stop_lsn(dest_xlog_path, stop_backup_result.lsn, &current); +#if PG_VERSION_NUM >= 90600 /* Write backup_label */ Assert(stop_backup_result.backup_label_content != NULL); pg_stop_backup_write_file_helper(dest_pgdata, PG_BACKUP_LABEL_FILE, "backup label", @@ -954,6 +955,7 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads, stop_backup_result.tablespace_map_content = NULL; stop_backup_result.tablespace_map_content_len = 0; } +#endif if(wait_WAL_streaming_end(NULL)) elog(ERROR, "WAL streaming failed"); From ff444b3e704035842c8c0488063b2e73e4d23090 Mon Sep 17 00:00:00 2001 From: "Mikhail A. 
Kulagin" Date: Fri, 11 Jun 2021 05:34:16 +0300 Subject: [PATCH 60/63] rename tests (include simple tests to basic suite) --- tests/catchup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/catchup.py b/tests/catchup.py index 46e56a885..5df538e42 100644 --- a/tests/catchup.py +++ b/tests/catchup.py @@ -12,7 +12,7 @@ def setUp(self): ######################################### # Basic tests ######################################### - def test_simple_full_catchup(self): + def test_basic_full_catchup(self): """ Test 'multithreaded basebackup' mode (aka FULL catchup) """ @@ -115,7 +115,7 @@ def test_full_catchup_with_tablespace(self): dst_pg.stop() self.del_test_dir(module_name, self.fname) - def test_simple_delta_catchup(self): + def test_basic_delta_catchup(self): """ Test delta catchup """ @@ -182,7 +182,7 @@ def test_simple_delta_catchup(self): dst_pg.stop() self.del_test_dir(module_name, self.fname) - def test_simple_ptrack_catchup(self): + def test_basic_ptrack_catchup(self): """ Test ptrack catchup """ From 83ccabc766c2173e9fb26b4d488d61b847bd0f7c Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Fri, 11 Jun 2021 20:56:22 +0300 Subject: [PATCH 61/63] add comment of future improvements --- src/catchup.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/catchup.c b/src/catchup.c index 4318ffa24..f80a0f0f9 100644 --- a/src/catchup.c +++ b/src/catchup.c @@ -617,6 +617,14 @@ do_catchup(const char *source_pgdata, const char *dest_pgdata, int num_threads, // fill dest_redo.lsn and dest_redo.tli get_redo(dest_pgdata, FIO_LOCAL_HOST, &dest_redo); elog(INFO, "syncLSN = %X/%X", (uint32) (dest_redo.lsn >> 32), (uint32) dest_redo.lsn); + + /* + * Future improvement to catch partial catchup: + * 1. rename dest pg_control into something like pg_control.pbk + * (so user can't start partial catchup'ed instance from this point) + * 2. try to read by get_redo() pg_control and pg_control.pbk (to detect partial catchup) + * 3. 
at the end (after copy of correct pg_control), remove pg_control.pbk + */ } //REVIEW I wonder, if we can move this piece above and call before pg_start backup()? From d987131efbcbb1d45508eb49b54752d248440221 Mon Sep 17 00:00:00 2001 From: "Mikhail A. Kulagin" Date: Tue, 15 Jun 2021 12:45:03 +0300 Subject: [PATCH 62/63] fix help message --- src/help.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/help.c b/src/help.c index faad59cb2..921feaec0 100644 --- a/src/help.c +++ b/src/help.c @@ -1042,7 +1042,7 @@ help_catchup(void) printf(_(" [--help]\n\n")); printf(_(" -b, --backup-mode=catchup-mode catchup mode=FULL|DELTA|PTRACK\n")); - printf(_(" --stream stream the transaction log and include it in the backup\n")); + printf(_(" --stream stream the transaction log (only supported mode)\n")); printf(_(" -S, --slot=SLOTNAME replication slot to use\n")); printf(_(" --temp-slot use temporary replication slot\n")); From 68cfd5b00bd9d597f43ff1d7101da9c159234712 Mon Sep 17 00:00:00 2001 From: Elena Indrupskaya Date: Wed, 16 Jun 2021 16:09:02 +0300 Subject: [PATCH 63/63] [DOC] Documentation updated upon feedback from Probackup team --- doc/pgprobackup.xml | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/doc/pgprobackup.xml b/doc/pgprobackup.xml index e2e8fe954..70ac9f9c5 100644 --- a/doc/pgprobackup.xml +++ b/doc/pgprobackup.xml @@ -3447,7 +3447,7 @@ pg_probackup delete -B backup_dir --instance Cloning a PostgreSQL instance is different from other pg_probackup - processes: + operations: @@ -3465,6 +3465,24 @@ pg_probackup delete -B backup_dir --instance + + + No SQL commands involving tablespaces, such as + CREATE TABLESPACE/DROP TABLESPACE, + can be run simultaneously with catchup. + + + + + catchup takes configuration files, such as + postgresql.conf, postgresql.auto.conf, + or pg_hba.conf, from the source server and overwrites them + on the target server. 
+ + @@ -3484,7 +3502,7 @@ pg_probackup delete -B backup_dir --instance - To use the PTRACK backup mode, set up PTRACK backups. + To use the PTRACK catchup mode, set up PTRACK backups. @@ -3520,8 +3538,9 @@ pg_probackup catchup -b catchup-mode --source-pgdata= - PTRACK — creates an incremental backup tracking page - changes on the fly. + PTRACK — tracking page changes on the fly, + only copies pages that have changed since the point of divergence + of the source and destination databases. For this mode, the destination directory must contain a previous copy of the database that was shut down cleanly. @@ -3544,7 +3563,7 @@ pg_probackup catchup -b catchup-mode --source-pgdata= - For example, assume that a remote standby server with the PostgreSQL instance in /replica-pgdata data directory has fallen behind. To sync this instance with the one in /master-pgdata data directory, you can run + For example, assume that a remote standby server with the PostgreSQL instance having /replica-pgdata data directory has fallen behind. To sync this instance with the one in /master-pgdata data directory, you can run the catchup command in the PTRACK mode on four parallel threads as follows: pg_probackup catchup --source-pgdata=/master-pgdata --destination-pgdata=/replica-pgdata -p 5432 -d postgres -U remote-postgres-user --stream --backup-mode=PTRACK --remote-host=remote-hostname --remote-user=remote-unix-username -j 4 @@ -4468,8 +4487,9 @@ pg_probackup catchup -b catchup_mode - PTRACK — creates an incremental PTRACK backup tracking - page changes on the fly. + PTRACK — tracking page changes on the fly, + only copies pages that have changed since the point of divergence + of the source and destination databases.