           summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'


+      ########## SYSTEM RULES ##########
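+      # 'up' is 1 for a successful scrape and 0 for a failed one, so avg_over_time(up[5m]) is the
+      # fraction of successful scrapes over the last 5 minutes; a value below 0.5 means the exporter
+      # was unreachable for at least half of that window.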
+      - alert: ExporterDown
+        expr: avg_over_time(up[5m]) < 0.5
+        for: 10s
+        labels:
+          service: system
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
+          summary: 'Prometheus Exporter Service Down'
+
+
       ########## POSTGRESQL RULES ##########
       - alert: PGIsUp
         expr: pg_up < 1
@@ -173,6 +186,27 @@ data:
           description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
           summary: 'PGSQL Instance connections'

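+      # predict_linear() extrapolates the last hour of free-space samples 24 hours (24 * 3600 seconds)
+      # ahead; a predicted value below zero means the disk is on track to fill within a day. The second
+      # condition keeps the alert quiet until current usage is already above 70%.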
+      - alert: DiskFillPredict
+        expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
+        for: 5m
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          summary: 'Disk predicted to be full in 24 hours'
+          description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'
+
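+      # Compares each instance's current recovery status against its value 5 minutes ago, ignoring the
+      # labels that change during a failover (instance, ip, pod, role); any difference within a
+      # pg_cluster indicates a switchover/failover event.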
+      - alert: PGClusterRoleChange
+        expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details'
+
       - alert: PGDiskSize
         expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
         for: 60s
@@ -196,7 +230,7 @@ data:
           summary: 'PGSQL Instance size critical'

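+      # 5.24288e+07 bytes = 50 MiB; the critical-severity rule that follows uses 1.048576e+08 bytes = 100 MiB.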
       - alert: PGReplicationByteLag
-        expr: ccp_replication_status_byte_lag > 5.24288e+07
+        expr: ccp_replication_lag_size_bytes > 5.24288e+07
         for: 60s
         labels:
           service: postgresql
@@ -207,7 +241,7 @@ data:
           summary: 'PGSQL Instance replica lag warning'

       - alert: PGReplicationByteLag
-        expr: ccp_replication_status_byte_lag > 1.048576e+08
+        expr: ccp_replication_lag_size_bytes > 1.048576e+08
         for: 60s
         labels:
           service: postgresql
@@ -313,12 +347,15 @@ data:
       # Otherwise rule will be applied to all stanzas returned on target system if not set.
       #
       # Relevant metric names are:
-      #   ccp_backrest_last_full_time_since_completion_seconds
-      #   ccp_backrest_last_incr_time_since_completion_seconds
-      #   ccp_backrest_last_diff_time_since_completion_seconds
+      #   ccp_backrest_last_full_backup_time_since_completion_seconds
+      #   ccp_backrest_last_incr_backup_time_since_completion_seconds
+      #   ccp_backrest_last_diff_backup_time_since_completion_seconds
+      #
+      # To avoid false positives on backup time alerts, 12 hours are added to each threshold as a buffer in case the backup runtime varies from day to day.
+      # Further adjustment may be needed depending on your backup runtimes/schedule.
       #
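+      # For example, a weekly full backup gives 7 days (604800s) + 12 hours (43200s) = 648000s, and a
+      # daily incremental gives 24 hours (86400s) + 12 hours (43200s) = 129600s, matching the thresholds below.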
       # - alert: PGBackRestLastCompletedFull_main
-      #   expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800
+      #   expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
       #   for: 60s
       #   labels:
       #     service: postgresql
@@ -328,7 +365,7 @@ data:
       #     summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
       #
       # - alert: PGBackRestLastCompletedIncr_main
-      #   expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400
+      #   expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
       #   for: 60s
       #   labels:
       #     service: postgresql
@@ -340,14 +377,14 @@ data:
       #
       # Runtime monitoring is handled with a single metric:
       #
-      #   ccp_backrest_last_runtime_backup_runtime_seconds
+      #   ccp_backrest_last_info_backup_runtime_seconds
       #
       # Runtime monitoring should have the "backup_type" label set.
       # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
       # Stanza should also be set if runtimes per stanza have different expected times
       #
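+      # The example thresholds below are in seconds: 14400s = 4 hours for the full backup runtime and
+      # 3600s = 1 hour for the differential backup runtime.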
       # - alert: PGBackRestLastRuntimeFull_main
-      #   expr: ccp_backrest_last_runtime_backup_runtime_seconds {backup_type="full", stanza="main"} > 14400
+      #   expr: ccp_backrest_last_info_backup_runtime_seconds {backup_type="full", stanza="main"} > 14400
       #   for: 60s
       #   labels:
       #     service: postgresql
@@ -357,7 +394,7 @@ data:
       #     summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
       #
       # - alert: PGBackRestLastRuntimeDiff_main
-      #   expr: ccp_backrest_last_runtime_backup_runtime_seconds {backup_type="diff", stanza="main"} > 3600
+      #   expr: ccp_backrest_last_info_backup_runtime_seconds {backup_type="diff", stanza="main"} > 3600
       #   for: 60s
       #   labels:
       #     service: postgresql
@@ -382,6 +419,7 @@ data:
       #     severity_num: 300
       #   annotations:
       #     description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'
+
 kind : ConfigMap
 metadata :
   labels :