@@ -1431,6 +1431,142 @@ wait_wal_lsn(const char *wal_segment_dir, XLogRecPtr target_lsn, bool is_start_l
1431
1431
}
1432
1432
}
1433
1433
1434
+ /*
1435
+ * Check stop_lsn (returned from pg_stop_backup()) and update backup->stop_lsn
1436
+ */
1437
+ void
1438
+ wait_wal_and_calculate_stop_lsn (const char * xlog_path , XLogRecPtr stop_lsn , pgBackup * backup )
1439
+ {
1440
+ bool stop_lsn_exists = false;
1441
+
1442
+ /* It is ok for replica to return invalid STOP LSN
1443
+ * UPD: Apparently it is ok even for a master.
1444
+ */
1445
+ if (!XRecOffIsValid (stop_lsn ))
1446
+ {
1447
+ XLogSegNo segno = 0 ;
1448
+ XLogRecPtr lsn_tmp = InvalidXLogRecPtr ;
1449
+
1450
+ /*
1451
+ * Even though the value is invalid, it's expected postgres behaviour
1452
+ * and we're trying to fix it below.
1453
+ */
1454
+ elog (LOG , "Invalid offset in stop_lsn value %X/%X, trying to fix" ,
1455
+ (uint32 ) (stop_lsn >> 32 ), (uint32 ) (stop_lsn ));
1456
+
1457
+ /*
1458
+ * Note: even with gdb it is very hard to produce automated tests for
1459
+ * contrecord + invalid LSN, so emulate it for manual testing.
1460
+ */
1461
+ //lsn = lsn - XLOG_SEG_SIZE;
1462
+ //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1463
+ // (uint32) (stop_lsn >> 32), (uint32) (stop_lsn));
1464
+
1465
+ GetXLogSegNo (stop_lsn , segno , instance_config .xlog_seg_size );
1466
+
1467
+ /*
1468
+ * Note, that there is no guarantee that corresponding WAL file even exists.
1469
+ * Replica may return LSN from future and keep staying in present.
1470
+ * Or it can return invalid LSN.
1471
+ *
1472
+ * That's bad, since we want to get real LSN to save it in backup label file
1473
+ * and to use it in WAL validation.
1474
+ *
1475
+ * So we try to do the following:
1476
+ * 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1477
+ * look for the first valid record in it.
1478
+ * It solves the problem of occasional invalid LSN on write-busy system.
1479
+ * 2. Failing that, look for record in previous segment with endpoint
1480
+ * equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN
1481
+ * on write-idle system. If that fails too, error out.
1482
+ */
1483
+
1484
+ /* stop_lsn is pointing to a 0 byte of xlog segment */
1485
+ if (stop_lsn % instance_config .xlog_seg_size == 0 )
1486
+ {
1487
+ /* Wait for segment with current stop_lsn, it is ok for it to never arrive */
1488
+ wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1489
+ false, true, WARNING , backup -> stream );
1490
+
1491
+ /* Get the first record in segment with current stop_lsn */
1492
+ lsn_tmp = get_first_record_lsn (xlog_path , segno , backup -> tli ,
1493
+ instance_config .xlog_seg_size ,
1494
+ instance_config .archive_timeout );
1495
+
1496
+ /* Check that returned LSN is valid and greater than stop_lsn */
1497
+ if (XLogRecPtrIsInvalid (lsn_tmp ) ||
1498
+ !XRecOffIsValid (lsn_tmp ) ||
1499
+ lsn_tmp < stop_lsn )
1500
+ {
1501
+ /* Backup from master should error out here */
1502
+ if (!backup -> from_replica )
1503
+ elog (ERROR , "Failed to get next WAL record after %X/%X" ,
1504
+ (uint32 ) (stop_lsn >> 32 ),
1505
+ (uint32 ) (stop_lsn ));
1506
+
1507
+ /* No luck, falling back to looking up for previous record */
1508
+ elog (WARNING , "Failed to get next WAL record after %X/%X, "
1509
+ "looking for previous WAL record" ,
1510
+ (uint32 ) (stop_lsn >> 32 ),
1511
+ (uint32 ) (stop_lsn ));
1512
+
1513
+ /* Despite looking for previous record there is not guarantee of success
1514
+ * because previous record can be the contrecord.
1515
+ */
1516
+ lsn_tmp = wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1517
+ true, false, ERROR , backup -> stream );
1518
+
1519
+ /* sanity */
1520
+ if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1521
+ elog (ERROR , "Failed to get WAL record prior to %X/%X" ,
1522
+ (uint32 ) (stop_lsn >> 32 ),
1523
+ (uint32 ) (stop_lsn ));
1524
+ }
1525
+ }
1526
+ /* stop lsn is aligned to xlog block size, just find next lsn */
1527
+ else if (stop_lsn % XLOG_BLCKSZ == 0 )
1528
+ {
1529
+ /* Wait for segment with current stop_lsn */
1530
+ wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1531
+ false, true, ERROR , backup -> stream );
1532
+
1533
+ /* Get the next closest record in segment with current stop_lsn */
1534
+ lsn_tmp = get_next_record_lsn (xlog_path , segno , backup -> tli ,
1535
+ instance_config .xlog_seg_size ,
1536
+ instance_config .archive_timeout ,
1537
+ stop_lsn );
1538
+
1539
+ /* sanity */
1540
+ if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1541
+ elog (ERROR , "Failed to get WAL record next to %X/%X" ,
1542
+ (uint32 ) (stop_lsn >> 32 ),
1543
+ (uint32 ) (stop_lsn ));
1544
+ }
1545
+ /* PostgreSQL returned something very illegal as STOP_LSN, error out */
1546
+ else
1547
+ elog (ERROR , "Invalid stop_backup_lsn value %X/%X" ,
1548
+ (uint32 ) (stop_lsn >> 32 ), (uint32 ) (stop_lsn ));
1549
+
1550
+ /* Setting stop_backup_lsn will set stop point for streaming */
1551
+ stop_backup_lsn = lsn_tmp ;
1552
+ stop_lsn_exists = true;
1553
+ }
1554
+
1555
+ elog (LOG , "stop_lsn: %X/%X" ,
1556
+ (uint32 ) (stop_lsn >> 32 ), (uint32 ) (stop_lsn ));
1557
+
1558
+ /*
1559
+ * Wait for stop_lsn to be archived or streamed.
1560
+ * If replica returned valid STOP_LSN of not actually existing record,
1561
+ * look for previous record with endpoint >= STOP_LSN.
1562
+ */
1563
+ if (!stop_lsn_exists )
1564
+ stop_backup_lsn = wait_wal_lsn (xlog_path , stop_lsn , false, backup -> tli ,
1565
+ false, false, ERROR , backup -> stream );
1566
+
1567
+ backup -> stop_lsn = stop_backup_lsn ;
1568
+ }
1569
+
1434
1570
/* Remove annoying NOTICE messages generated by backend */
1435
1571
void
1436
1572
pg_silent_client_messages (PGconn * conn )
@@ -1729,7 +1865,6 @@ static void
1729
1865
pg_stop_backup (InstanceState * instanceState , pgBackup * backup , PGconn * pg_startbackup_conn ,
1730
1866
PGNodeInfo * nodeInfo )
1731
1867
{
1732
- bool stop_lsn_exists = false;
1733
1868
PGStopBackupResult stop_backup_result ;
1734
1869
char * xlog_path , stream_xlog_path [MAXPGPATH ];
1735
1870
/* kludge against some old bug in archive_timeout. TODO: remove in 3.0.0 */
@@ -1772,121 +1907,7 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb
1772
1907
else
1773
1908
xlog_path = instanceState -> instance_wal_subdir_path ;
1774
1909
1775
- /* It is ok for replica to return invalid STOP LSN
1776
- * UPD: Apparently it is ok even for a master.
1777
- */
1778
- if (!XRecOffIsValid (stop_backup_result .lsn ))
1779
- {
1780
- XLogSegNo segno = 0 ;
1781
- XLogRecPtr lsn_tmp = InvalidXLogRecPtr ;
1782
-
1783
- /*
1784
- * Even though the value is invalid, it's expected postgres behaviour
1785
- * and we're trying to fix it below.
1786
- */
1787
- elog (LOG , "Invalid offset in stop_lsn value %X/%X, trying to fix" ,
1788
- (uint32 ) (stop_backup_result .lsn >> 32 ), (uint32 ) (stop_backup_result .lsn ));
1789
-
1790
- /*
1791
- * Note: even with gdb it is very hard to produce automated tests for
1792
- * contrecord + invalid LSN, so emulate it for manual testing.
1793
- */
1794
- //stop_backup_result.lsn = stop_backup_result.lsn - XLOG_SEG_SIZE;
1795
- //elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1796
- // (uint32) (stop_backup_result.lsn >> 32), (uint32) (stop_backup_result.lsn));
1797
-
1798
- GetXLogSegNo (stop_backup_result .lsn , segno , instance_config .xlog_seg_size );
1799
-
1800
- /*
1801
- * Note, that there is no guarantee that corresponding WAL file even exists.
1802
- * Replica may return LSN from future and keep staying in present.
1803
- * Or it can return invalid LSN.
1804
- *
1805
- * That's bad, since we want to get real LSN to save it in backup label file
1806
- * and to use it in WAL validation.
1807
- *
1808
- * So we try to do the following:
1809
- * 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1810
- * look for the first valid record in it.
1811
- * It solves the problem of occasional invalid LSN on write-busy system.
1812
- * 2. Failing that, look for record in previous segment with endpoint
1813
- * equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN
1814
- * on write-idle system. If that fails too, error out.
1815
- */
1816
-
1817
- /* stop_lsn is pointing to a 0 byte of xlog segment */
1818
- if (stop_backup_result .lsn % instance_config .xlog_seg_size == 0 )
1819
- {
1820
- /* Wait for segment with current stop_lsn, it is ok for it to never arrive */
1821
- wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1822
- false, true, WARNING , backup -> stream );
1823
-
1824
- /* Get the first record in segment with current stop_lsn */
1825
- lsn_tmp = get_first_record_lsn (xlog_path , segno , backup -> tli ,
1826
- instance_config .xlog_seg_size ,
1827
- instance_config .archive_timeout );
1828
-
1829
- /* Check that returned LSN is valid and greater than stop_lsn */
1830
- if (XLogRecPtrIsInvalid (lsn_tmp ) ||
1831
- !XRecOffIsValid (lsn_tmp ) ||
1832
- lsn_tmp < stop_backup_result .lsn )
1833
- {
1834
- /* Backup from master should error out here */
1835
- if (!backup -> from_replica )
1836
- elog (ERROR , "Failed to get next WAL record after %X/%X" ,
1837
- (uint32 ) (stop_backup_result .lsn >> 32 ),
1838
- (uint32 ) (stop_backup_result .lsn ));
1839
-
1840
- /* No luck, falling back to looking up for previous record */
1841
- elog (WARNING , "Failed to get next WAL record after %X/%X, "
1842
- "looking for previous WAL record" ,
1843
- (uint32 ) (stop_backup_result .lsn >> 32 ),
1844
- (uint32 ) (stop_backup_result .lsn ));
1845
-
1846
- /* Despite looking for previous record there is not guarantee of success
1847
- * because previous record can be the contrecord.
1848
- */
1849
- lsn_tmp = wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1850
- true, false, ERROR , backup -> stream );
1851
-
1852
- /* sanity */
1853
- if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1854
- elog (ERROR , "Failed to get WAL record prior to %X/%X" ,
1855
- (uint32 ) (stop_backup_result .lsn >> 32 ),
1856
- (uint32 ) (stop_backup_result .lsn ));
1857
- }
1858
- }
1859
- /* stop lsn is aligned to xlog block size, just find next lsn */
1860
- else if (stop_backup_result .lsn % XLOG_BLCKSZ == 0 )
1861
- {
1862
- /* Wait for segment with current stop_lsn */
1863
- wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1864
- false, true, ERROR , backup -> stream );
1865
-
1866
- /* Get the next closest record in segment with current stop_lsn */
1867
- lsn_tmp = get_next_record_lsn (xlog_path , segno , backup -> tli ,
1868
- instance_config .xlog_seg_size ,
1869
- instance_config .archive_timeout ,
1870
- stop_backup_result .lsn );
1871
-
1872
- /* sanity */
1873
- if (!XRecOffIsValid (lsn_tmp ) || XLogRecPtrIsInvalid (lsn_tmp ))
1874
- elog (ERROR , "Failed to get WAL record next to %X/%X" ,
1875
- (uint32 ) (stop_backup_result .lsn >> 32 ),
1876
- (uint32 ) (stop_backup_result .lsn ));
1877
- }
1878
- /* PostgreSQL returned something very illegal as STOP_LSN, error out */
1879
- else
1880
- elog (ERROR , "Invalid stop_backup_lsn value %X/%X" ,
1881
- (uint32 ) (stop_backup_result .lsn >> 32 ), (uint32 ) (stop_backup_result .lsn ));
1882
-
1883
- /* Setting stop_backup_lsn will set stop point for streaming */
1884
- stop_backup_lsn = lsn_tmp ;
1885
- stop_lsn_exists = true;
1886
- }
1887
-
1888
- elog (LOG , "stop_lsn: %X/%X" ,
1889
- (uint32 ) (stop_backup_result .lsn >> 32 ), (uint32 ) (stop_backup_result .lsn ));
1910
+ wait_wal_and_calculate_stop_lsn (xlog_path , stop_backup_result .lsn , backup );
1890
1911
1891
1912
/* Write backup_label and tablespace_map */
1892
1913
if (!exclusive_backup )
@@ -1917,15 +1938,6 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb
1917
1938
}
1918
1939
}
1919
1940
1920
- /*
1921
- * Wait for stop_lsn to be archived or streamed.
1922
- * If replica returned valid STOP_LSN of not actually existing record,
1923
- * look for previous record with endpoint >= STOP_LSN.
1924
- */
1925
- if (!stop_lsn_exists )
1926
- stop_backup_lsn = wait_wal_lsn (xlog_path , stop_backup_result .lsn , false, backup -> tli ,
1927
- false, false, ERROR , backup -> stream );
1928
-
1929
1941
if (backup -> stream )
1930
1942
{
1931
1943
/* This function will also add list of xlog files
@@ -1934,7 +1946,6 @@ pg_stop_backup(InstanceState *instanceState, pgBackup *backup, PGconn *pg_startb
1934
1946
elog (ERROR , "WAL streaming failed" );
1935
1947
}
1936
1948
1937
- backup -> stop_lsn = stop_backup_lsn ;
1938
1949
backup -> recovery_xid = stop_backup_result .snapshot_xid ;
1939
1950
1940
1951
elog (LOG , "Getting the Recovery Time from WAL" );
0 commit comments