Skip to content

Commit 3a5d179

Browse files
author
Muhammad Shahzeb
committed
Add alerts for postgres
1 parent f59d4af commit 3a5d179

File tree

1 file changed

+239
-0
lines changed

1 file changed

+239
-0
lines changed

postgres_mixin/alerts/postgres.libsonnet

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,245 @@
120120
severity: 'warning',
121121
},
122122
},
123+
{
  // Warns when, averaged across pods/instances, more than 10% of finished
  // transactions (commits + rollbacks) over the last 5 minutes were
  // rollbacks, sustained for 5 minutes.
  alert: 'PostgresHasTooManyRollbacks',
  annotations: {
    // The selectors below require a non-empty `db_name` label and
    // `avg without(pod, instance)` preserves it, so `db_name` is the database
    // label guaranteed to be populated on the firing series.
    // NOTE(review): `cluster` is not referenced by the expression; confirm the
    // scraped series actually carry it.
    description: 'PostgreSQL has too many rollbacks on {{ $labels.cluster }} for database {{ $labels.db_name }} with a value of {{ $value }}',
    summary: 'PostgreSQL has too many rollbacks.',
  },
  expr: |||
    avg without(pod, instance)
    (rate(pg_stat_database_xact_rollback{db_name!~"template.*|^$"}[5m]) /
    (rate(pg_stat_database_xact_commit{db_name!~"template.*|^$"}[5m])+ rate(pg_stat_database_xact_rollback{db_name!~"template.*|^$"}[5m]))) > 0.10
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'warning',
  },
},
139+
{
  // Warns when the worst pod/instance sees more than 5 deadlocks per minute
  // (per-second rate over 5m, multiplied by 60), sustained for 5 minutes.
  alert: 'PostgresHasHighDeadLocks',
  annotations: {
    // The selector below requires a non-empty `db_name` label and
    // `max without(pod, instance)` preserves it, so `db_name` is the database
    // label guaranteed to be populated on the firing series.
    // NOTE(review): `cluster` is not referenced by the expression; confirm the
    // scraped series actually carry it.
    description: 'PostgreSQL has too high deadlocks on {{ $labels.cluster }} for database {{ $labels.db_name }} with a value of {{ $value }}',
    summary: 'PostgreSQL has high number of deadlocks.',
  },
  expr: |||
    max without(pod, instance) (rate(pg_stat_database_deadlocks{db_name!~"template.*|^$"}[5m]) * 60) > 5
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'warning',
  },
},
153+
{
  // Warns when the number of held locks exceeds 20% of
  // max_locks_per_transaction * max_connections, sustained for 5 minutes.
  alert: 'PostgresAcquiredTooManyLocks',
  annotations: {
    // `max by(...)` keeps ONLY server, job, db_name, asserts_env,
    // asserts_site and namespace. Any other label (e.g. cluster, datname) is
    // dropped by the aggregation and would render as an empty string, so the
    // description must reference labels from that list.
    description: 'PostgreSQL has acquired too many locks on {{ $labels.server }} for database {{ $labels.db_name }} with a value of {{ $value }}',
    summary: 'PostgreSQL has high number of acquired locks.',
  },
  expr: |||
    max by( server, job, db_name, asserts_env, asserts_site, namespace) ((pg_locks_count{db_name!~"template.*|^$"}) /
    on(instance, asserts_env, asserts_site, namespace) group_left(server) (pg_settings_max_locks_per_transaction{} * pg_settings_max_connections{})) > 0.20
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'warning',
  },
},
168+
{
  // Critical when the 5-minute per-second rate of pg_xlog_position_bytes
  // stays below 200000 for 5 minutes.
  alert: 'PostgresXLOGConsumptionVeryLow',
  expr: 'rate(pg_xlog_position_bytes{asserts_env!=""}[5m]) < 200000',
  'for': '5m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'critical',
  },
  annotations: {
    summary: 'PostgreSQL XLOG consumption is very low.',
    description: 'PostgreSQL instance {{ $labels.instance }} has a very low XLOG consumption rate.',
  },
},
182+
{
  // Critical when the 2-minute per-second rate of pg_xlog_position_bytes
  // exceeds 36700160 (35 * 1024 * 1024) for 10 minutes, restricted to
  // instances reporting pg_replication_is_replica == 0.
  alert: 'PostgresXLOGConsumptionVeryHigh',
  expr: 'rate(pg_xlog_position_bytes{asserts_env!=""}[2m]) > 36700160 and on (instance, asserts_env, asserts_site) (pg_replication_is_replica{asserts_env!=""} == 0)',
  'for': '10m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'critical',
  },
  annotations: {
    summary: 'PostgreSQL very high XLOG consumption rate.',
    description: '{{ $labels.instance }} is experiencing very high XLOG consumption rate, which might indicate excessive write operations.',
  },
},
196+
{
  // Critical when pg_stat_replication_pg_xlog_location_diff is non-zero for
  // 5 minutes.
  // NOTE(review): a non-zero location diff looks like "replica is behind"
  // rather than strictly "replication stopped" - confirm the intended
  // semantics against the exporter query that produces this metric.
  alert: 'PostgresReplicationStopped',
  expr: 'pg_stat_replication_pg_xlog_location_diff{asserts_env!=""} != 0',
  'for': '5m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'critical',
  },
  annotations: {
    summary: 'PostgreSQL replication has stopped.',
    description: 'PostgreSQL instance {{ $labels.instance }} has stopped replication.',
  },
},
210+
{
  // Warns when pg_replication_lag stays above 3600 for 5 minutes on an
  // instance that reports pg_replication_is_replica == 1.
  alert: 'PostgresReplicationLagging_More_1Hour',
  expr: '(pg_replication_lag{asserts_env!=""} > 3600) and on (instance) (pg_replication_is_replica{asserts_env!=""} == 1)',
  'for': '5m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'warning',
  },
  annotations: {
    summary: 'PostgreSQL replication lagging more than 1 hour.',
    description: '{{ $labels.instance }} replication lag exceeds 1 hour. Check for network issues or load imbalances.',
  },
},
224+
{
  // Critical when the XLOG position of the non-replica instance
  // (pg_replication_is_replica == 0) is more than 1e+09 ahead of a replica
  // (pg_replication_is_replica == 1) in the same job/service/asserts_env/
  // asserts_site, sustained for 5 minutes.
  // NOTE(review): with group_right(instance) the `instance` label on the
  // result is taken from the left-hand (non-replica) operand, so
  // {{ $labels.instance }} names that side, not the lagging replica - confirm
  // this is intended.
  alert: 'PostgresReplicationLagBytesAreTooLarge',
  expr: '(pg_xlog_position_bytes{asserts_env!=""} and pg_replication_is_replica{asserts_env!=""} == 0) - on (job, service, asserts_env, asserts_site) group_right(instance) (pg_xlog_position_bytes{asserts_env!=""} and pg_replication_is_replica{asserts_env!=""} == 1) > 1e+09',
  'for': '5m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'critical',
  },
  annotations: {
    summary: 'PostgreSQL replication lag in bytes too large.',
    description: '{{ $labels.instance }} replication lag in bytes is too large, which might indicate replication issues or network bottlenecks.',
  },
},
238+
{
  // Critical when a replication slot reports pg_replication_slots_active == 0
  // for 30 minutes.
  // NOTE(review): the alert name says "SlotUsed" while the expression and the
  // annotation text describe an UNUSED slot. The name is left untouched here
  // because routing and silences may key on it.
  alert: 'PostgresHasReplicationSlotUsed',
  expr: 'pg_replication_slots_active{asserts_env!=""} == 0',
  'for': '30m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'critical',
  },
  annotations: {
    summary: 'PostgreSQL has unused replication slots.',
    description: '{{ $labels.instance }} has replication slots that are not used, which might lead to replication lag or data inconsistency.',
  },
},
252+
{
  // Critical when pg_replication_slots_xmin_age stays above 20000 for 30
  // minutes on slots whose name matches ^repmgr_slot_[0-9]+.
  alert: 'PostgresReplicationIsStale',
  expr: 'pg_replication_slots_xmin_age{asserts_env!="", slot_name =~ "^repmgr_slot_[0-9]+"} > 20000',
  'for': '30m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'critical',
  },
  annotations: {
    summary: 'PostgreSQL replication slots are stale.',
    description: '{{ $labels.instance }} replication slots have not been updated for a significant period, indicating potential issues with replication.',
  },
},
266+
{
  // Warns when pg_replication_is_replica changed value at least once within
  // the last minute. There is no 'for' clause, so the alert fires on the
  // first evaluation in which the expression returns a series.
  alert: 'PostgresReplicationRoleChanged',
  expr: 'pg_replication_is_replica{asserts_env!=""} and changes(pg_replication_is_replica{asserts_env!=""}[1m]) > 0',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'warning',
  },
  annotations: {
    summary: 'PostgreSQL replication role change detected.',
    description: '{{ $labels.instance }} replication role has changed. Verify if this is expected or if it indicates a failover.',
  },
},
279+
{
  // Critical when pg_exporter_last_scrape_error stays above 0 for 30 minutes.
  alert: 'PostgresHasExporterErrors',
  expr: 'pg_exporter_last_scrape_error{asserts_env!=""} > 0',
  'for': '30m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'critical',
  },
  annotations: {
    summary: 'PostgreSQL exporter errors detected.',
    description: '{{ $labels.instance }} exporter is experiencing errors. Verify exporter health and configuration.',
  },
},
293+
{
  // Warns when, summed over relname, dead tuples exceed 10000 AND make up at
  // least 10% of live + dead tuples for 5 minutes. Series belonging to an
  // instance that reports pg_replication_is_replica == 1 are excluded.
  alert: 'PostgresHasTooManyDeadTuples',
  expr: '(sum without(relname) (pg_stat_user_tables_n_dead_tup{asserts_env!="", db_name!~"template.*|^$"}) > 10000) / ((sum without(relname) (pg_stat_user_tables_n_live_tup{asserts_env!="", db_name!~"template.*|^$"}) + sum without(relname)(pg_stat_user_tables_n_dead_tup{asserts_env!="", db_name!~"template.*|^$"})) > 0) >= 0.1 unless on(instance, asserts_env, asserts_site) (pg_replication_is_replica{asserts_env!=""} == 1)',
  'for': '5m',
  labels: {
    asserts_alert_category: 'failure',
    asserts_entity_type: 'Service',
    asserts_severity: 'warning',
  },
  annotations: {
    summary: 'PostgreSQL has too many dead tuples.',
    description: '{{ $labels.instance }} has too many dead tuples, which may lead to inefficient query performance. Consider vacuuming the database.',
  },
},
307+
{
  // Critical when a table has more dead tuples than the autovacuum trigger
  // point (live * autovacuum_vacuum_scale_factor + autovacuum_vacuum_threshold)
  // AND its last autovacuum is more than 36000 seconds (10 hours) old,
  // sustained for 30 minutes.
  //
  // The previous expression compared `group without(...)(timestamp(...))`
  // against `time() - 36000`. The `group` aggregator always yields the value
  // 1, so that comparison was always true and the 10-hour condition was never
  // actually evaluated. The age check now happens inside the aggregation, the
  // same way the sibling PostgresTableNotAnalyzed rule does it.
  alert: 'PostgresTablesNotVaccumed',
  annotations: {
    // `group without(pod, instance)` removes `instance`, so it cannot be used
    // in the template.
    // NOTE(review): relname/db_name are assumed to be present on
    // pg_stat_user_tables_* (this file already aggregates over `relname` and
    // filters on `db_name` for that metric family) - confirm.
    description: 'Table {{ $labels.relname }} in database {{ $labels.db_name }} has not been vacuumed recently, which may lead to performance degradation.',
    summary: 'PostgreSQL tables not vacuumed.',
  },
  expr: |||
    group without(pod, instance) (
      timestamp(
        pg_stat_user_tables_n_dead_tup{asserts_env!=""} >
          pg_stat_user_tables_n_live_tup{asserts_env!=""}
            * on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{asserts_env!=""}
            + on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{asserts_env!=""}
      )
      -
      pg_stat_user_tables_last_autovacuum{asserts_env!=""}
      > 36000
    )
  |||,
  'for': '30m',
  labels: {
    asserts_severity: 'critical',
    asserts_entity_type: 'Service',
    asserts_alert_category: 'failure',
  },
},
321+
{
  // Warns when a table has more dead tuples than
  // live * autovacuum_analyze_scale_factor + autovacuum_analyze_threshold
  // AND its last autoanalyze is more than 24 hours old. There is no 'for'
  // clause, so the alert fires on the first matching evaluation.
  alert: 'PostgresTableNotAnalyzed',
  annotations: {
    // `group without(pod, instance)` removes `instance`, so the previous
    // `{{ $labels.instance }}` placeholder always rendered empty.
    // NOTE(review): relname/db_name are assumed to be present on
    // pg_stat_user_tables_* (this file already aggregates over `relname` and
    // filters on `db_name` for that metric family) - confirm.
    description: 'Table {{ $labels.relname }} in database {{ $labels.db_name }} has not been analyzed recently, which might lead to inefficient query planning.',
    summary: 'PostgreSQL table not analyzed.',
  },
  expr: |||
    group without(pod, instance) (
      timestamp(
        pg_stat_user_tables_n_dead_tup{asserts_env!=""} >
          pg_stat_user_tables_n_live_tup{asserts_env!=""}
            * on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_scale_factor{asserts_env!=""}
            + on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_threshold{asserts_env!=""}
      )
      -
      pg_stat_user_tables_last_autoanalyze{asserts_env!=""}
      > 24 * 60 * 60
    )
  |||,
  labels: {
    asserts_severity: 'warning',
    asserts_entity_type: 'DataSource',
    asserts_alert_category: 'failure',
  },
},
345+
{
346+
// Warns when timed checkpoints account for less than half of all checkpoints
// (timed + requested) over the last 5 minutes, i.e. requested checkpoints
// dominate, sustained for 5 minutes.
alert: 'PostgresTooManyCheckpointsRequested',
347+
annotations: {
348+
description: '{{ $labels.instance }} is requesting too many checkpoints, which may lead to performance degradation.',
349+
summary: 'PostgreSQL too many checkpoints requested.',
350+
},
351+
// Ratio = rate(timed) / (rate(timed) + rate(req)); the alert condition is
// ratio < 0.5.
expr:'
352+
rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) /
353+
(rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{asserts_env!=""}[5m]))
354+
< 0.5',
355+
'for': '5m',
356+
labels: {
357+
asserts_severity: 'warning',
358+
asserts_entity_type: 'Service',
359+
asserts_alert_category: 'failure',
360+
},
361+
},
123362
],
124363
},
125364
],

0 commit comments

Comments
 (0)