|
11 | 11 | summary: 'Postgres connections count is over the maximum amount.',
|
12 | 12 | },
|
13 | 13 | expr: |||
|
14 |
| - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
| 14 | + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
15 | 15 | >=
|
16 |
| - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
| 16 | + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
|
17 | 17 | -
|
18 |
| - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
19 |
| - ||| % $._config, |
| 18 | + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
| 19 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
20 | 20 | 'for': '1m',
|
21 | 21 | labels: {
|
22 | 22 | severity: 'warning',
|
|
29 | 29 | summary: 'Postgres connections count is over 80% of maximum amount.',
|
30 | 30 | },
|
31 | 31 | expr: |||
|
32 |
| - sum by (instance) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
| 32 | + sum by (%(agg)s) (pg_stat_activity_count{%(postgresExporterSelector)s}) |
33 | 33 | >
|
34 | 34 | (
|
35 |
| - sum by (instance) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
| 35 | + sum by (%(agg)s) (pg_settings_max_connections{%(postgresExporterSelector)s}) |
36 | 36 | -
|
37 |
| - sum by (instance) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
| 37 | + sum by (%(agg)s) (pg_settings_superuser_reserved_connections{%(postgresExporterSelector)s}) |
38 | 38 | ) * 0.8
|
39 |
| - ||| % $._config, |
| 39 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
40 | 40 | 'for': '10m',
|
41 | 41 | labels: {
|
42 | 42 | severity: 'warning',
|
|
61 | 61 | summary: 'PostgreSQL high number of slow queries.',
|
62 | 62 | },
|
63 | 63 | expr: |||
|
64 |
| - avg by (datname) ( |
| 64 | + avg by (datname, %(agg)s) (
65 | 65 | rate (
|
66 | 66 | pg_stat_activity_max_tx_duration{%(dbNameFilter)s,%(postgresExporterSelector)s}[2m]
|
67 | 67 | )
|
68 | 68 | ) > 2 * 60
|
69 |
| - ||| % $._config, |
| 69 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
70 | 70 | 'for': '2m',
|
71 | 71 | labels: {
|
72 | 72 | severity: 'warning',
|
|
79 | 79 | summary: 'PostgreSQL high number of queries per second.',
|
80 | 80 | },
|
81 | 81 | expr: |||
|
82 |
| - avg by (datname) ( |
| 82 | + avg by (datname, %(agg)s) ( |
83 | 83 | irate(
|
84 | 84 | pg_stat_database_xact_commit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
|
85 | 85 | )
|
|
88 | 88 | pg_stat_database_xact_rollback{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m]
|
89 | 89 | )
|
90 | 90 | ) > 10000
|
91 |
| - ||| % $._config, |
| 91 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
92 | 92 | 'for': '5m',
|
93 | 93 | labels: {
|
94 | 94 | severity: 'warning',
|
|
101 | 101 | summary: 'PostgreSQL low cache hit rate.',
|
102 | 102 | },
|
103 | 103 | expr: |||
|
104 |
| - avg by (datname) ( |
| 104 | + avg by (datname, %(agg)s) ( |
105 | 105 | rate(pg_stat_database_blks_hit{%(dbNameFilter)s,%(postgresExporterSelector)s}[5m])
|
106 | 106 | /
|
107 | 107 | (
|
|
114 | 114 | )
|
115 | 115 | )
|
116 | 116 | ) < 0.98
|
117 |
| - ||| % $._config, |
| 117 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
118 | 118 | 'for': '5m',
|
119 | 119 | labels: {
|
120 | 120 | severity: 'warning',
|
|
157 | 157 | summary: 'PostgreSQL has high number of acquired locks.',
|
158 | 158 | },
|
159 | 159 | expr: |||
|
160 |
| - max by( server, job, datname, namespace) ((pg_locks_count{%(dbNameFilter)s}) / |
161 |
| - on(instance, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 |
162 |
| - ||| % $._config, |
| 160 | + max by(datname, %(agg)s) ((pg_locks_count{%(dbNameFilter)s}) / |
| 161 | + on(%(agg)s) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20 |
| 162 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
163 | 163 | 'for': '5m',
|
164 | 164 | labels: {
|
165 | 165 | severity: 'warning',
|
|
171 | 171 | description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.',
|
172 | 172 | summary: 'PostgreSQL replication lagging more than 1 hour.',
|
173 | 173 | },
|
174 |
| - expr: '(pg_replication_lag{} > 3600) and on (instance) (pg_replication_is_replica{} == 1)', |
| 174 | + expr: ||| |
| 175 | + (pg_replication_lag{} > 3600) and on (%(agg)s) (pg_replication_is_replica{} == 1) |
| 176 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
175 | 177 | 'for': '5m',
|
176 | 178 | labels: {
|
177 | 179 | severity: 'warning',
|
|
223 | 225 | timestamp(
|
224 | 226 | pg_stat_user_tables_n_dead_tup{} >
|
225 | 227 | pg_stat_user_tables_n_live_tup{}
|
226 |
| - * on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{} |
227 |
| - + on(namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{} |
| 228 | + * on(%(agg)s) group_left pg_settings_autovacuum_vacuum_scale_factor{} |
| 229 | + + on(%(agg)s) group_left pg_settings_autovacuum_vacuum_threshold{} |
228 | 230 | )
|
229 | 231 | < time() - 36000
|
230 | 232 | )
|
231 |
| - |||, |
| 233 | + ||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) }, |
232 | 234 | 'for': '30m',
|
233 | 235 | labels: {
|
234 | 236 | severity: 'critical',
|
|
0 commit comments