Skip to content

Commit 86c9ebf

Browse files
authored
Merge pull request bucardo#59 from kabalin/pgbouncer_maxwait
Add pgbouncer_maxwait check
2 parents 1bd5920 + 801821c commit 86c9ebf

File tree

1 file changed

+132
-0
lines changed

1 file changed

+132
-0
lines changed

check_postgres.pl

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,9 @@ package check_postgres;
211211
'pgb-backends-msg' => q{$1 of $2 connections ($3%)},
212212
'pgb-backends-none' => q{No connections},
213213
'pgb-backends-users' => q{$1 for number of users must be a number or percentage},
214+
'pgb-maxwait-msg' => q{longest wait: $1s},
215+
'pgb-maxwait-nomatch'=> q{No matching rows were found},
216+
'pgb-maxwait-skipped'=> q{No matching rows were found (skipped rows: $1)},
214217
'PID' => q{PID},
215218
'port' => q{port},
216219
'preptxn-none' => q{No prepared transactions found},
@@ -1913,6 +1916,7 @@ package check_postgres;
19131916
pgb_pool_maxwait => [1, 'Check the current maximum wait time for client connections in pgbouncer pools.'],
19141917
pgbouncer_backends => [0, 'Check how many clients are connected to pgbouncer compared to max_client_conn.'],
19151918
pgbouncer_checksum => [0, 'Check that no pgbouncer settings have changed since the last check.'],
1919+
pgbouncer_maxwait => [0, 'Check how long the first (oldest) client in queue has been waiting.'],
19161920
pgagent_jobs => [0, 'Check for no failed pgAgent jobs within a specified period of time.'],
19171921
prepared_txns => [1, 'Checks number and age of prepared transactions.'],
19181922
query_runtime => [0, 'Check how long a specific query takes to run.'],
@@ -2769,6 +2773,9 @@ sub finishup {
27692773
## Check the current maximum wait time for client connections in pgbouncer pools
27702774
check_pgb_pool('maxwait') if $action eq 'pgb_pool_maxwait';
27712775

2776+
## Check how long the first (oldest) client in queue has been waiting.
2777+
check_pgbouncer_maxwait() if $action eq 'pgbouncer_maxwait';
2778+
27722779
## Check how many clients are connected to pgbouncer compared to max_client_conn.
27732780
check_pgbouncer_backends() if $action eq 'pgbouncer_backends';
27742781

@@ -6758,6 +6765,107 @@ sub check_pgbouncer_checksum {
67586765

67596766
} ## end of check_pgbouncer_checksum
67606767

6768+
sub check_pgbouncer_maxwait {

    ## Check how long the first (oldest) client in queue has waited, in
    ## seconds, using the maxwait column returned by pgbouncer's SHOW POOLS.
    ## Supports: Nagios, MRTG
    ## Warning and critical are time limits - defaults to seconds
    ## Valid units: s[econd], m[inute], h[our], d[ay]
    ## All above may be written as plural as well (e.g. "2 hours")
    ## Can also ignore databases with exclude and limit with include
    ## Takes no arguments and returns nothing: results are reported through
    ## add_ok / add_warning / add_critical (or do_mrtg when $MRTG is set)

    my ($warning, $critical) = validate_range
        ({
          type => 'time',
          });

    ## Ask pgbouncer for its pool statistics
    $SQL = q{SHOW POOLS};

    my $info = run_command($SQL, { regex => qr{\d+}, emptyok => 1 } );

    ## NOTE(review): $db is the file-level variable, not a lexical; the
    ## add_* helpers presumably rely on it to know the current target - confirm
    for $db (@{$info->{db}}) {

        ## Default values for information gathered
        ## These are reset for every target so that a long wait seen on one
        ## target is never carried over and blamed on the next one
        my ($maxwait, $database, $user, $cl_active, $cl_waiting) =
            (0,'?','?',0,0);

        ## Parse the SHOW POOLS output and gather stats from the winning row
        my $skipped = 0;
        ROW: for my $r (@{$db->{slurp}}) {

            ## Apply --exclude and --include arguments to the database name
            if (skip_item($r->{database})) {
                $skipped++;
                next ROW;
            }

            ## Assign stats if we have a new winner
            ## The comparison is strict: on a tie the first row seen is kept,
            ## and a pool with a maxwait of zero can never become the winner
            if ($r->{maxwait} > $maxwait) {
                $database   = $r->{database};
                $user       = $r->{user};
                $cl_active  = $r->{cl_active};
                $cl_waiting = $r->{cl_waiting};
                $maxwait    = $r->{maxwait};
            }
        }

        ## No winner: every row was filtered out, or nobody is waiting
        ## We don't really care why as far as the final output goes,
        ## but it's nice to report what we can
        if ($database eq '?') {
            $MRTG and do_mrtg({one => 0, msg => 'No rows'});
            $db->{perf} = "0;$warning;$critical";

            if ($skipped) {
                add_ok msg('pgb-maxwait-skipped', $skipped);
            }
            else {
                add_ok msg('pgb-maxwait-nomatch', $maxwait);
            }

            ## Move on rather than returning, so that any remaining
            ## targets are still checked
            next;
        }

        ## Details on who the offender was
        my $whodunit = sprintf q{%s:%s %s:%s cl_active:%s cl_waiting:%s},
            msg('database'),
            $database,
            msg('username'),
            $user,
            $cl_active,
            $cl_waiting;

        $MRTG and do_mrtg({one => $maxwait, msg => $whodunit});

        $db->{perf} .= sprintf q{'%s'=%s;%s;%s},
            $whodunit,
            $maxwait,
            $warning,
            $critical;

        my $m = msg('pgb-maxwait-msg', $maxwait);
        my $msg = sprintf '%s (%s)', $m, $whodunit;

        ## A threshold that was not supplied is an empty string and is ignored
        if (length $critical and $maxwait >= $critical) {
            add_critical $msg;
        }
        elsif (length $warning and $maxwait >= $warning) {
            add_warning $msg;
        }
        else {
            add_ok $msg;
        }
    }

    return;

} ## end of check_pgbouncer_maxwait
6868+
67616869
sub check_pgbouncer_backends {
67626870

67636871
## Check the number of connections to pgbouncer compared to
@@ -10504,6 +10612,30 @@ =head2 B<pgbouncer_checksum>
1050410612
checksum must be provided as the C<--mrtg> argument. The fourth line always gives the
1050510613
current checksum.
1050610614
10615+
=head2 B<pgbouncer_maxwait>
10616+
10617+
(C<symlink: check_postgres_pgbouncer_maxwait>) Checks how long the first
10618+
(oldest) client in the queue has been waiting, in seconds. If this starts
10619+
increasing, then the current pool of servers does not handle requests quickly
10620+
enough. The reason may be either an overloaded server or just too small of a
10621+
pool_size setting in the pgbouncer config file. Databases can be filtered by use
10622+
of the I<--include> and I<--exclude> options. See the L</"BASIC FILTERING">
10623+
section for more details. The values for the I<--warning> and I<--critical>
10624+
options are units of time, and must be provided (no default). Valid units are
10625+
'seconds', 'minutes', 'hours', or 'days'. Each may be written singular or
10626+
abbreviated to just the first letter. If no units are given, the units are
10627+
assumed to be seconds.
10628+
10629+
This action requires Postgres 8.3 or better.
10630+
10631+
Example 1: Give a critical if any client has been waiting for more than 10
10632+
minutes:
10633+
10634+
check_postgres_pgbouncer_maxwait -p 6432 -u pgbouncer --critical='10 minutes'
10635+
10636+
For MRTG output, returns the maximum time in seconds a client has been
10637+
waiting on the first line. The fourth line gives the database, user, and client counts of that pool.
10638+
1050710639
=head2 B<pgagent_jobs>
1050810640
1050910641
(C<symlink: check_postgres_pgagent_jobs>) Checks that all the pgAgent jobs

0 commit comments

Comments
 (0)