Skip to content

Commit 295b8a1

Browse files
committed
[Issue #183] If backup deletion failed to lock backup due to "out of space" condition, then treat backup as locked and carry on.
1 parent 23532e8 commit 295b8a1

File tree

9 files changed

+119
-73
lines changed

9 files changed

+119
-73
lines changed

src/backup.c

+9-9
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
230230
prev_backup_start_lsn = prev_backup->start_lsn;
231231
current.parent_backup = prev_backup->start_time;
232232

233-
write_backup(&current);
233+
write_backup(&current, true);
234234
}
235235

236236
/*
@@ -287,7 +287,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
287287
base36enc(prev_backup->start_time));
288288

289289
/* Update running backup meta with START LSN */
290-
write_backup(&current);
290+
write_backup(&current, true);
291291

292292
pgBackupGetPath(&current, database_path, lengthof(database_path),
293293
DATABASE_DIR);
@@ -496,7 +496,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
496496
/* write initial backup_content.control file and update backup.control */
497497
write_backup_filelist(&current, backup_files_list,
498498
instance_config.pgdata, external_dirs);
499-
write_backup(&current);
499+
write_backup(&current, true);
500500

501501
/* init thread args with own file lists */
502502
threads = (pthread_t *) palloc(sizeof(pthread_t) * num_threads);
@@ -661,7 +661,7 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync)
661661
write_backup_filelist(&current, backup_files_list, instance_config.pgdata,
662662
external_dirs);
663663
/* update backup control file to update size info */
664-
write_backup(&current);
664+
write_backup(&current, true);
665665

666666
/* Sync all copied files unless '--no-sync' flag is used */
667667
if (no_sync)
@@ -840,10 +840,10 @@ do_backup(time_t start_time, bool no_validate,
840840
/* Create backup directory and BACKUP_CONTROL_FILE */
841841
if (pgBackupCreateDir(&current))
842842
elog(ERROR, "Cannot create backup directory");
843-
if (!lock_backup(&current))
843+
if (!lock_backup(&current, true))
844844
elog(ERROR, "Cannot lock backup %s directory",
845845
base36enc(current.start_time));
846-
write_backup(&current);
846+
write_backup(&current, true);
847847

848848
/* set the error processing function for the backup process */
849849
pgut_atexit_push(backup_cleanup, NULL);
@@ -931,7 +931,7 @@ do_backup(time_t start_time, bool no_validate,
931931
/* Backup is done. Update backup status */
932932
current.end_time = time(NULL);
933933
current.status = BACKUP_STATUS_DONE;
934-
write_backup(&current);
934+
write_backup(&current, true);
935935

936936
/* Pin backup if requested */
937937
if (set_backup_params &&
@@ -2020,7 +2020,7 @@ backup_cleanup(bool fatal, void *userdata)
20202020
base36enc(current.start_time));
20212021
current.end_time = time(NULL);
20222022
current.status = BACKUP_STATUS_ERROR;
2023-
write_backup(&current);
2023+
write_backup(&current, true);
20242024
}
20252025
}
20262026

@@ -2065,7 +2065,7 @@ backup_files(void *arg)
20652065
write_backup_filelist(&current, arguments->files_list, arguments->from_root,
20662066
arguments->external_dirs);
20672067
/* update backup control file to update size info */
2068-
write_backup(&current);
2068+
write_backup(&current, true);
20692069
}
20702070
}
20712071

src/catalog.c

+37-23
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,11 @@ unlink_lock_atexit(void)
8989
* If no backup matches, return NULL.
9090
*/
9191
pgBackup *
92-
read_backup(const char *instance_name, time_t timestamp)
92+
read_backup(const char *root_dir)
9393
{
94-
pgBackup tmp;
9594
char conf_path[MAXPGPATH];
9695

97-
tmp.start_time = timestamp;
98-
pgBackupGetPathInInstance(instance_name, &tmp, conf_path,
99-
lengthof(conf_path), BACKUP_CONTROL_FILE, NULL);
96+
join_path_components(conf_path, root_dir, BACKUP_CONTROL_FILE);
10097

10198
return readBackupControlFile(conf_path);
10299
}
@@ -109,11 +106,11 @@ read_backup(const char *instance_name, time_t timestamp)
109106
*/
110107
void
111108
write_backup_status(pgBackup *backup, BackupStatus status,
112-
const char *instance_name)
109+
const char *instance_name, bool strict)
113110
{
114111
pgBackup *tmp;
115112

116-
tmp = read_backup(instance_name, backup->start_time);
113+
tmp = read_backup(backup->root_dir);
117114
if (!tmp)
118115
{
119116
/*
@@ -125,7 +122,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
125122

126123
backup->status = status;
127124
tmp->status = backup->status;
128-
write_backup(tmp);
125+
tmp->root_dir = pgut_strdup(backup->root_dir);
126+
127+
write_backup(tmp, strict);
129128

130129
pgBackupFree(tmp);
131130
}
@@ -134,7 +133,7 @@ write_backup_status(pgBackup *backup, BackupStatus status,
134133
* Create exclusive lockfile in the backup's directory.
135134
*/
136135
bool
137-
lock_backup(pgBackup *backup)
136+
lock_backup(pgBackup *backup, bool strict)
138137
{
139138
char lock_file[MAXPGPATH];
140139
int fd;
@@ -280,6 +279,14 @@ lock_backup(pgBackup *backup)
280279
fio_unlink(lock_file, FIO_BACKUP_HOST);
281280
/* if write didn't set errno, assume problem is no disk space */
282281
errno = save_errno ? save_errno : ENOSPC;
282+
283+
/* In lax mode, if we failed to grab the lock because of an 'out of space' error,
284+
* then treat backup as locked.
285+
* Only the delete command should be run in lax mode.
286+
*/
287+
if (!strict && errno == ENOSPC)
288+
return true;
289+
283290
elog(ERROR, "Could not write lock file \"%s\": %s",
284291
lock_file, strerror(errno));
285292
}
@@ -536,7 +543,7 @@ get_backup_filelist(pgBackup *backup)
536543
parray *files = NULL;
537544
char backup_filelist_path[MAXPGPATH];
538545

539-
pgBackupGetPath(backup, backup_filelist_path, lengthof(backup_filelist_path), DATABASE_FILE_LIST);
546+
join_path_components(backup_filelist_path, backup->root_dir, DATABASE_FILE_LIST);
540547
files = dir_read_file_list(NULL, NULL, backup_filelist_path, FIO_BACKUP_HOST);
541548

542549
/* redundant sanity? */
@@ -550,7 +557,7 @@ get_backup_filelist(pgBackup *backup)
550557
* Lock list of backups. Function goes in backward direction.
551558
*/
552559
void
553-
catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx)
560+
catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx, bool strict)
554561
{
555562
int start_idx,
556563
end_idx;
@@ -565,7 +572,7 @@ catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx)
565572
for (i = start_idx; i >= end_idx; i--)
566573
{
567574
pgBackup *backup = (pgBackup *) parray_get(backup_list, i);
568-
if (!lock_backup(backup))
575+
if (!lock_backup(backup, strict))
569576
elog(ERROR, "Cannot lock backup %s directory",
570577
base36enc(backup->start_time));
571578
}
@@ -837,7 +844,7 @@ pgBackupCreateDir(pgBackup *backup)
837844
/* create directories for actual backup files */
838845
for (i = 0; i < parray_num(subdirs); i++)
839846
{
840-
pgBackupGetPath(backup, path, lengthof(path), parray_get(subdirs, i));
847+
join_path_components(path, backup->root_dir, parray_get(subdirs, i));
841848
fio_mkdir(path, DIR_PERMISSION, FIO_BACKUP_HOST);
842849
}
843850

@@ -1580,7 +1587,7 @@ pin_backup(pgBackup *target_backup, pgSetBackupParams *set_backup_params)
15801587
return;
15811588

15821589
/* Update backup.control */
1583-
write_backup(target_backup);
1590+
write_backup(target_backup, true);
15841591

15851592
if (set_backup_params->ttl > 0 || set_backup_params->expire_time > 0)
15861593
{
@@ -1630,7 +1637,7 @@ add_note(pgBackup *target_backup, char *note)
16301637
}
16311638

16321639
/* Update backup.control */
1633-
write_backup(target_backup);
1640+
write_backup(target_backup, true);
16341641
}
16351642

16361643
/*
@@ -1735,27 +1742,34 @@ pgBackupWriteControl(FILE *out, pgBackup *backup)
17351742
* Save the backup content into BACKUP_CONTROL_FILE.
17361743
*/
17371744
void
1738-
write_backup(pgBackup *backup)
1745+
write_backup(pgBackup *backup, bool strict)
17391746
{
1740-
FILE *fp = NULL;
1741-
char path[MAXPGPATH];
1742-
char path_temp[MAXPGPATH];
1743-
int errno_temp;
1747+
FILE *fp = NULL;
1748+
char path[MAXPGPATH];
1749+
char path_temp[MAXPGPATH];
1750+
int errno_temp;
1751+
char buf[4096];
17441752

1745-
pgBackupGetPath(backup, path, lengthof(path), BACKUP_CONTROL_FILE);
1753+
join_path_components(path, backup->root_dir, BACKUP_CONTROL_FILE);
17461754
snprintf(path_temp, sizeof(path_temp), "%s.tmp", path);
17471755

17481756
fp = fio_fopen(path_temp, PG_BINARY_W, FIO_BACKUP_HOST);
17491757
if (fp == NULL)
17501758
elog(ERROR, "Cannot open configuration file \"%s\": %s",
17511759
path_temp, strerror(errno));
17521760

1761+
setvbuf(fp, buf, _IOFBF, sizeof(buf));
1762+
17531763
pgBackupWriteControl(fp, backup);
17541764

1755-
if (fio_fflush(fp) || fio_fclose(fp))
1765+
if (fio_fclose(fp))
17561766
{
17571767
errno_temp = errno;
17581768
fio_unlink(path_temp, FIO_BACKUP_HOST);
1769+
1770+
if (!strict && errno_temp == ENOSPC)
1771+
return;
1772+
17591773
elog(ERROR, "Cannot write configuration file \"%s\": %s",
17601774
path_temp, strerror(errno_temp));
17611775
}
@@ -1788,7 +1802,7 @@ write_backup_filelist(pgBackup *backup, parray *files, const char *root,
17881802
int64 uncompressed_size_on_disk = 0;
17891803
int64 wal_size_on_disk = 0;
17901804

1791-
pgBackupGetPath(backup, path, lengthof(path), DATABASE_FILE_LIST);
1805+
join_path_components(path, backup->root_dir, DATABASE_FILE_LIST);
17921806
snprintf(path_temp, sizeof(path_temp), "%s.tmp", path);
17931807

17941808
out = fio_fopen(path_temp, PG_BINARY_W, FIO_BACKUP_HOST);

src/delete.c

+6-6
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ do_delete(time_t backup_id)
8989
if (!dry_run)
9090
{
9191
/* Lock backups marked for deletion */
92-
catalog_lock_backup_list(delete_list, parray_num(delete_list) - 1, 0);
92+
catalog_lock_backup_list(delete_list, parray_num(delete_list) - 1, 0, false);
9393

9494
/* Delete backups from the end of list */
9595
for (i = (int) parray_num(delete_list) - 1; i >= 0; i--)
@@ -510,7 +510,7 @@ do_retention_merge(parray *backup_list, parray *to_keep_list, parray *to_purge_l
510510
parray_rm(to_purge_list, full_backup, pgBackupCompareId);
511511

512512
/* Lock merge chain */
513-
catalog_lock_backup_list(merge_list, parray_num(merge_list) - 1, 0);
513+
catalog_lock_backup_list(merge_list, parray_num(merge_list) - 1, 0, true);
514514

515515
/* Consider this extreme case */
516516
// PAGEa1 PAGEb1 both valid
@@ -627,7 +627,7 @@ do_retention_purge(parray *to_keep_list, parray *to_purge_list)
627627
continue;
628628

629629
/* Actual purge */
630-
if (!lock_backup(delete_backup))
630+
if (!lock_backup(delete_backup, false))
631631
{
632632
/* If the backup still is used, do not interrupt and go to the next */
633633
elog(WARNING, "Cannot lock backup %s directory, skip purging",
@@ -746,7 +746,7 @@ delete_backup_files(pgBackup *backup)
746746
* Update STATUS to BACKUP_STATUS_DELETING in case an error occurs
747747
* before all backup files have been deleted.
748748
*/
749-
write_backup_status(backup, BACKUP_STATUS_DELETING, instance_name);
749+
write_backup_status(backup, BACKUP_STATUS_DELETING, instance_name, false);
750750

751751
/* list files to be deleted */
752752
files = parray_new();
@@ -968,7 +968,7 @@ do_delete_instance(void)
968968
/* Delete all backups. */
969969
backup_list = catalog_get_backup_list(instance_name, INVALID_BACKUP_ID);
970970

971-
catalog_lock_backup_list(backup_list, 0, parray_num(backup_list) - 1);
971+
catalog_lock_backup_list(backup_list, 0, parray_num(backup_list) - 1, true);
972972

973973
for (i = 0; i < parray_num(backup_list); i++)
974974
{
@@ -1091,7 +1091,7 @@ do_delete_status(InstanceConfig *instance_config, const char *status)
10911091
if (backup->stream)
10921092
size_to_delete += backup->wal_bytes;
10931093

1094-
if (!dry_run && lock_backup(backup))
1094+
if (!dry_run && lock_backup(backup, false))
10951095
delete_backup_files(backup);
10961096

10971097
n_deleted++;

src/merge.c

+20-8
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ do_merge(time_t backup_id)
397397
parray_append(merge_list, full_backup);
398398

399399
/* Lock merge chain */
400-
catalog_lock_backup_list(merge_list, parray_num(merge_list) - 1, 0);
400+
catalog_lock_backup_list(merge_list, parray_num(merge_list) - 1, 0, true);
401401

402402
/* do actual merge */
403403
merge_chain(merge_list, full_backup, dest_backup);
@@ -583,10 +583,10 @@ merge_chain(parray *parent_chain, pgBackup *full_backup, pgBackup *dest_backup)
583583
*/
584584
backup->merge_dest_backup = dest_backup->start_time;
585585
backup->status = BACKUP_STATUS_MERGING;
586-
write_backup(backup);
586+
write_backup(backup, true);
587587
}
588588
else
589-
write_backup_status(backup, BACKUP_STATUS_MERGING, instance_name);
589+
write_backup_status(backup, BACKUP_STATUS_MERGING, instance_name, true);
590590
}
591591

592592
/* Create directories */
@@ -704,9 +704,13 @@ merge_chain(parray *parent_chain, pgBackup *full_backup, pgBackup *dest_backup)
704704

705705
/* If incremental backup is pinned,
706706
* then result FULL backup must also be pinned.
707+
* Conversely, if FULL backup was pinned and dest was not,
708+
* then the resulting backup is no longer pinned.
707709
*/
708-
if (dest_backup->expire_time)
709-
full_backup->expire_time = dest_backup->expire_time;
710+
full_backup->expire_time = dest_backup->expire_time;
711+
712+
pg_free(full_backup->note);
713+
full_backup->note = NULL;
710714

711715
if (dest_backup->note)
712716
full_backup->note = pgut_strdup(dest_backup->note);
@@ -724,7 +728,7 @@ merge_chain(parray *parent_chain, pgBackup *full_backup, pgBackup *dest_backup)
724728
parray_qsort(result_filelist, pgFileCompareRelPathWithExternal);
725729

726730
write_backup_filelist(full_backup, result_filelist, full_database_dir, NULL);
727-
write_backup(full_backup);
731+
write_backup(full_backup, true);
728732

729733
/* Delete FULL backup files that do not exist in destination backup
730734
* Both arrays must be sorted in reversed order to delete from leaf
@@ -760,7 +764,7 @@ merge_chain(parray *parent_chain, pgBackup *full_backup, pgBackup *dest_backup)
760764
* Files are merged into FULL backup. It is time to remove incremental chain.
761765
*/
762766
full_backup->status = BACKUP_STATUS_MERGED;
763-
write_backup(full_backup);
767+
write_backup(full_backup, true);
764768

765769
merge_delete:
766770
for (i = parray_num(parent_chain) - 2; i >= 0; i--)
@@ -787,6 +791,10 @@ merge_chain(parray *parent_chain, pgBackup *full_backup, pgBackup *dest_backup)
787791
if (rename(full_backup->root_dir, dest_backup->root_dir) == -1)
788792
elog(ERROR, "Could not rename directory \"%s\" to \"%s\": %s",
789793
full_backup->root_dir, dest_backup->root_dir, strerror(errno));
794+
795+
/* update root_dir after rename */
796+
pg_free(full_backup->root_dir);
797+
full_backup->root_dir = pgut_strdup(dest_backup->root_dir);
790798
}
791799
else
792800
{
@@ -804,6 +812,10 @@ merge_chain(parray *parent_chain, pgBackup *full_backup, pgBackup *dest_backup)
804812
if (rename(full_backup->root_dir, destination_path) == -1)
805813
elog(ERROR, "Could not rename directory \"%s\" to \"%s\": %s",
806814
full_backup->root_dir, destination_path, strerror(errno));
815+
816+
/* update root_dir after rename */
817+
pg_free(full_backup->root_dir);
818+
full_backup->root_dir = pgut_strdup(destination_path);
807819
}
808820

809821
/* If we crash here, it will produce full backup in MERGED
@@ -821,7 +833,7 @@ merge_chain(parray *parent_chain, pgBackup *full_backup, pgBackup *dest_backup)
821833
full_backup->status = BACKUP_STATUS_OK;
822834
full_backup->start_time = full_backup->merge_dest_backup;
823835
full_backup->merge_dest_backup = INVALID_BACKUP_ID;
824-
write_backup(full_backup);
836+
write_backup(full_backup, true);
825837
/* Critical section end */
826838

827839
/* Cleanup */

0 commit comments

Comments
 (0)