Skip to content

Commit 6b765c8

Browse files
author
Josh Williams
committed
Add xact timestamp support to hot_standby_delay
Allow the hot_standby_delay check to accept xlog byte position or timestamp lag intervals as thresholds, or even both at the same time.
1 parent ff6e828 commit 6b765c8

File tree

1 file changed

+67
-17
lines changed

1 file changed

+67
-17
lines changed

check_postgres.pl

Lines changed: 67 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,13 @@ package check_postgres;
144144
'fsm-page-msg' => q{fsm page slots used: $1 of $2 ($3%)},
145145
'fsm-rel-highver' => q{Cannot check fsm_relations on servers version 8.4 or greater},
146146
'fsm-rel-msg' => q{fsm relations used: $1 of $2 ($3%)},
147+
'hs-future-replica' => q{Slave reporting master server clock is ahead, check time sync},
147148
'hs-no-role' => q{Not a master/slave couple},
148149
'hs-no-location' => q{Could not get current xlog location on $1},
149150
'hs-receive-delay' => q{receive-delay},
150151
'hs-replay-delay' => q{replay_delay},
152+
'hs-time-delay' => q{time_delay},
153+
'hs-time-version' => q{Database must be version 9.1 or higher to check slave lag by time},
151154
'index' => q{Index},
152155
'invalid-option' => q{Invalid option},
153156
'invalid-query' => q{Invalid query returned: $1},
@@ -3108,6 +3111,9 @@ sub validate_size_or_percent_with_oper {
31083111

31093112

31103113
sub validate_integer_for_time {
3114+
# Used for txn_idle and hot_standby_delay
3115+
# txn_idle, et al., use the form "$count for $interval"
3116+
# hot_standby_delay appears as "$bytes and $interval"
31113117

31123118
my $arg = shift || {};
31133119
ndie qq{validate_integer_for_time must be called with a hashref\n}
@@ -3123,7 +3129,7 @@ sub validate_integer_for_time {
31233129
for my $spec ([ warning => $warning], [critical => $critical]) {
31243130
my ($level, $val) = @{ $spec };
31253131
if (length $val) {
3126-
if ($val =~ /^(.+?)\sfor\s(.+)$/i) {
3132+
if ($val =~ /^(.+?)\s(?:for|and)\s(.+)$/i) {
31273133
my ($int, $time) = ($1, $2);
31283134

31293135
# Integer first, time second.
@@ -3137,7 +3143,7 @@ sub validate_integer_for_time {
31373143
}
31383144
else {
31393145
# Disambiguate int from time int by sign.
3140-
if ($val =~ /^[-+]\d+$/) {
3146+
if (($val =~ /^[-+]\d+$/) || ($val =~ /^\d+$/ && $arg->{default_to_int})) {
31413147
ndie msg('range-int', $level) if $val !~ /^[-+]?\d+$/;
31423148
push @ret, int $val, '';
31433149
}
@@ -4741,9 +4747,17 @@ sub check_hot_standby_delay {
47414747
## Check on the delay in PITR replication between master and slave
47424748
## Supports: Nagios, MRTG
47434749
## Critical and warning are the delay between master and slave xlog locations
4744-
## Example: --critical=1024
4750+
## and/or transaction timestamps. If both are specified, both are checked.
4751+
## Examples:
4752+
## --critical=1024
4753+
## --warning=5min
4754+
## --warning='1048576 and 2min' --critical='16777216 and 10min'
47454755

4746-
my ($warning, $critical) = validate_range({type => 'integer', leastone => 1});
4756+
my ($warning, $wtime, $critical, $ctime) = validate_integer_for_time({default_to_int => 1});
4757+
if ($psql_version < 9.1 and (length $wtime or length $ctime)) {
4758+
add_unknown msg('hs-time-version');
4759+
return;
4760+
}
47474761

47484762
# check if master and slave comply with the check using pg_is_in_recovery()
47494763
my ($master, $slave);
@@ -4776,15 +4790,19 @@ sub check_hot_standby_delay {
47764790
}
47774791

47784792
## Get xlog positions
4779-
my ($moffset, $s_rec_offset, $s_rep_offset);
4793+
my ($moffset, $s_rec_offset, $s_rep_offset, $time_delta);
47804794

47814795
## On slave
47824796
$SQL = q{SELECT pg_last_xlog_receive_location() AS receive, pg_last_xlog_replay_location() AS replay};
4797+
if ($psql_version >= 9.1) {
4798+
$SQL .= q{, COALESCE(ROUND(EXTRACT(epoch FROM now() - pg_last_xact_replay_timestamp())),0) AS seconds};
4799+
}
47834800
my $info = run_command($SQL, { dbnumber => $slave, regex => qr/\// });
47844801
my $saved_db;
47854802
for $db (@{$info->{db}}) {
47864803
my $receive = $db->{slurp}[0]{receive};
47874804
my $replay = $db->{slurp}[0]{replay};
4805+
$time_delta = $db->{slurp}[0]{seconds};
47884806

47894807
if (defined $receive) {
47904808
my ($a, $b) = split(/\//, $receive);
@@ -4829,20 +4847,33 @@ sub check_hot_standby_delay {
48294847
# Make sure it's always positive or zero
48304848
$rec_delta = 0 if $rec_delta < 0;
48314849
$rep_delta = 0 if $rep_delta < 0;
4850+
if (defined $time_delta and $time_delta < 0) {
4851+
add_unknown msg('hs-future-replica');
4852+
return;
4853+
}
48324854

4833-
$MRTG and do_mrtg({one => $rep_delta, two => $rec_delta});
4855+
$MRTG and do_mrtg($psql_version >= 9.1 ?
4856+
{one => $rep_delta, two => $rec_delta, three => $time_delta} :
4857+
{one => $rep_delta, two => $rec_delta});
48344858

48354859
$db->{perf} = sprintf ' %s=%s;%s;%s ',
48364860
perfname(msg('hs-replay-delay')), $rep_delta, $warning, $critical;
48374861
$db->{perf} .= sprintf ' %s=%s;%s;%s',
48384862
perfname(msg('hs-receive-delay')), $rec_delta, $warning, $critical;
4863+
if ($psql_version >= 9.1) {
4864+
$db->{perf} .= sprintf ' %s=%s;%s;%s',
4865+
perfname(msg('hs-time-delay')), $time_delta, $wtime, $ctime;
4866+
}
48394867

48404868
## Do the check on replay delay in case SR has disconnected because it is way too far behind
48414869
my $msg = qq{$rep_delta};
4842-
if (length $critical and $rep_delta > $critical) {
4870+
if ($psql_version >= 9.1) {
4871+
$msg .= qq{ and $time_delta seconds}
4872+
}
4873+
if ((length $critical or length $ctime) and (!length $critical or length $critical and $rep_delta > $critical) and (!length $ctime or length $ctime and $time_delta > $ctime)) {
48434874
add_critical $msg;
48444875
}
4845-
elsif (length $warning and $rep_delta > $warning) {
4876+
elsif ((length $warning or length $wtime) and (!length $warning or length $warning and $rep_delta > $warning) and (!length $wtime or length $wtime and $time_delta > $wtime)) {
48464877
add_warning $msg;
48474878
}
48484879
else {
@@ -8812,15 +8843,34 @@ =head2 B<hitratio>
88128843
=head2 B<hot_standby_delay>
88138844
88148845
(C<symlink: check_hot_standby_delay>) Checks the streaming replication lag by computing the delta
8815-
between the xlog position of a master server and the one of the slaves connected to it. The slave_
8816-
server must be in hot_standby (e.g. read only) mode, therefore the minimum version to use this_
8817-
action is Postgres 9.0. The I<--warning> and I<--critical> options are the delta between xlog
8818-
location. These values should match the volume of transactions needed to have the streaming
8819-
replication disconnect from the master because of too much lag.
8820-
8821-
You must provide information on how to reach the second database by a connection
8822-
parameter ending in the number 2, such as "--dbport2=5543". If if it not given,
8823-
the action fails.
8846+
between the xlog position of a master server and the one of the slaves connected to it, and/or the
8847+
last transaction timestamp replayed by the slave. The slave server must be in hot_standby (i.e. read
8848+
only) mode, therefore the minimum version to use this action is Postgres 9.0. To support transaction
8849+
timestamps the minimum version is Postgres 9.1.
8850+
8851+
The I<--warning> and I<--critical> options are either the delta between xlog positions in bytes,
8852+
units of time to compare timestamps, or both.
8853+
8854+
Byte values should be based on the volume of transactions needed to have the streaming replication
8855+
disconnect from the master because of too much lag, determined by the Postgres configuration variable
8856+
B<wal_keep_segments>. For units of time, valid units are 'seconds', 'minutes', 'hours', or 'days'.
8857+
Each may be written singular or abbreviated to just the first letter. When specifying both, in the
8858+
form 'I<bytes> and I<time>', both conditions must be true for the threshold to be met.
8859+
8860+
You must provide information on how to reach the databases by providing a comma separated list to the
8861+
--dbhost and --dbport parameters, such as "--dbport=5432,5543". If not given, the action fails.
8862+
8863+
Example 1: Warn if a database with a local replica on port 5433 is behind on any xlog replay at all
8864+
8865+
check_hot_standby_delay --dbport=5432,5433 --warning='1'
8866+
8867+
Example 2: Give a critical if the last transaction replayed on replica1 was more than 10 minutes ago
8868+
8869+
check_hot_standby_delay --dbhost=master,replica1 --critical='10 min'
8870+
8871+
Example 3: Allow replica1 to be 1 WAL segment behind, if the master is momentarily seeing more activity than the streaming replication connection can handle, or 10 minutes behind, if the master is seeing very little activity and not processing any transactions, but not both, which would indicate a lasting problem with the replication connection.
8872+
8873+
check_hot_standby_delay --dbhost=master,replica1 --warning='1048576 and 2 min' --critical='16777216 and 10 min'
88248874
88258875
=head2 B<index_size>
88268876

0 commit comments

Comments
 (0)