|
174 | 174 | expr: 'rate(pg_xlog_position_bytes{asserts_env!=""}[5m]) < 200000',
|
175 | 175 | 'for': '5m',
|
176 | 176 | labels: {
|
177 |
| - asserts_severity: 'critical', |
178 |
| - asserts_entity_type: 'Service', |
179 |
| - asserts_alert_category: 'failure', |
| 177 | + severity: 'critical', |
180 | 178 | },
|
181 | 179 | },
|
182 | 180 | {
|
|
188 | 186 | expr: 'rate(pg_xlog_position_bytes{asserts_env!=""}[2m]) > 36700160 and on (instance, asserts_env, asserts_site) (pg_replication_is_replica{asserts_env!=""} == 0)',
|
189 | 187 | 'for': '10m',
|
190 | 188 | labels: {
|
191 |
| - asserts_severity: 'critical', |
192 |
| - asserts_entity_type: 'Service', |
193 |
| - asserts_alert_category: 'failure', |
| 189 | + severity: 'critical', |
194 | 190 | },
|
195 | 191 | },
|
196 | 192 | {
|
|
202 | 198 | expr: 'pg_stat_replication_pg_xlog_location_diff{asserts_env!=""} != 0',
|
203 | 199 | 'for': '5m',
|
204 | 200 | labels: {
|
205 |
| - asserts_severity: 'critical', |
206 |
| - asserts_entity_type: 'Service', |
207 |
| - asserts_alert_category: 'failure', |
| 201 | + severity: 'critical', |
208 | 202 | },
|
209 | 203 | },
|
210 | 204 | {
|
|
216 | 210 | expr: '(pg_replication_lag{asserts_env!=""} > 3600) and on (instance) (pg_replication_is_replica{asserts_env!=""} == 1)',
|
217 | 211 | 'for': '5m',
|
218 | 212 | labels: {
|
219 |
| - asserts_severity: 'warning', |
220 |
| - asserts_entity_type: 'Service', |
221 |
| - asserts_alert_category: 'failure', |
| 213 | + severity: 'warning', |
222 | 214 | },
|
223 | 215 | },
|
224 | 216 | {
|
|
230 | 222 | expr: '(pg_xlog_position_bytes{asserts_env!=""} and pg_replication_is_replica{asserts_env!=""} == 0) - on (job, service, asserts_env, asserts_site) group_right(instance) (pg_xlog_position_bytes{asserts_env!=""} and pg_replication_is_replica{asserts_env!=""} == 1) > 1e+09',
|
231 | 223 | 'for': '5m',
|
232 | 224 | labels: {
|
233 |
| - asserts_severity: 'critical', |
234 |
| - asserts_entity_type: 'Service', |
235 |
| - asserts_alert_category: 'failure', |
| 225 | + severity: 'critical', |
236 | 226 | },
|
237 | 227 | },
|
238 | 228 | {
|
|
244 | 234 | expr: 'pg_replication_slots_active{asserts_env!=""} == 0',
|
245 | 235 | 'for': '30m',
|
246 | 236 | labels: {
|
247 |
| - asserts_severity: 'critical', |
248 |
| - asserts_entity_type: 'Service', |
249 |
| - asserts_alert_category: 'failure', |
| 237 | + severity: 'critical', |
250 | 238 | },
|
251 | 239 | },
|
252 | 240 | {
|
|
258 | 246 | expr: 'pg_replication_slots_xmin_age{asserts_env!="", slot_name =~ "^repmgr_slot_[0-9]+"} > 20000',
|
259 | 247 | 'for': '30m',
|
260 | 248 | labels: {
|
261 |
| - asserts_severity: 'critical', |
262 |
| - asserts_entity_type: 'Service', |
263 |
| - asserts_alert_category: 'failure', |
| 249 | + severity: 'critical', |
264 | 250 | },
|
265 | 251 | },
|
266 | 252 | {
|
|
271 | 257 | },
|
272 | 258 | expr: 'pg_replication_is_replica{asserts_env!=""} and changes(pg_replication_is_replica{asserts_env!=""}[1m]) > 0',
|
273 | 259 | labels: {
|
274 |
| - asserts_severity: 'warning', |
275 |
| - asserts_entity_type: 'Service', |
276 |
| - asserts_alert_category: 'failure', |
| 260 | + severity: 'warning', |
277 | 261 | },
|
278 | 262 | },
|
279 | 263 | {
|
|
285 | 269 | expr: 'pg_exporter_last_scrape_error{asserts_env!=""} > 0',
|
286 | 270 | 'for': '30m',
|
287 | 271 | labels: {
|
288 |
| - asserts_severity: 'critical', |
289 |
| - asserts_entity_type: 'Service', |
290 |
| - asserts_alert_category: 'failure', |
| 272 | + severity: 'critical', |
291 | 273 | },
|
292 | 274 | },
|
293 | 275 | {
|
|
299 | 281 | expr: '(sum without(relname) (pg_stat_user_tables_n_dead_tup{asserts_env!="", db_name!~"template.*|^$"}) > 10000) / ((sum without(relname) (pg_stat_user_tables_n_live_tup{asserts_env!="", db_name!~"template.*|^$"}) + sum without(relname)(pg_stat_user_tables_n_dead_tup{asserts_env!="", db_name!~"template.*|^$"})) > 0) >= 0.1 unless on(instance, asserts_env, asserts_site) (pg_replication_is_replica{asserts_env!=""} == 1)',
|
300 | 282 | 'for': '5m',
|
301 | 283 | labels: {
|
302 |
| - asserts_severity: 'warning', |
303 |
| - asserts_entity_type: 'Service', |
304 |
| - asserts_alert_category: 'failure', |
| 284 | + severity: 'warning', |
305 | 285 | },
|
306 | 286 | },
|
307 | 287 | {
|
|
313 | 293 | expr: 'group without(pod, instance)(timestamp(pg_stat_user_tables_n_dead_tup{asserts_env!=""} > pg_stat_user_tables_n_live_tup{asserts_env!=""} * on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_scale_factor{asserts_env!=""} + on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_vacuum_threshold{asserts_env!=""})) < time() - 36000',
|
314 | 294 | 'for': '30m',
|
315 | 295 | labels: {
|
316 |
| - asserts_severity: 'critical', |
317 |
| - asserts_entity_type: 'Service', |
318 |
| - asserts_alert_category: 'failure', |
| 296 | + severity: 'critical', |
319 | 297 | },
|
320 | 298 | },
|
321 | 299 | {
|
|
324 | 302 | description: '{{ $labels.instance }} table has not been analyzed recently, which might lead to inefficient query planning.',
|
325 | 303 | summary: 'PostgreSQL table not analyzed.',
|
326 | 304 | },
|
327 |
| - expr: ' |
328 |
| - group without(pod, instance)( |
329 |
| - timestamp( |
330 |
| - pg_stat_user_tables_n_dead_tup{asserts_env!=""} > |
331 |
| - pg_stat_user_tables_n_live_tup{asserts_env!=""} |
332 |
| - * on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_scale_factor{asserts_env!=""} |
333 |
| - + on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_threshold{asserts_env!=""} |
334 |
| - ) |
335 |
| - - |
336 |
| - pg_stat_user_tables_last_autoanalyze{asserts_env!=""} |
337 |
| - > 24 * 60 * 60 |
338 |
| - )', |
| 305 | + expr: '\n group without(pod, instance)(\n timestamp(\n pg_stat_user_tables_n_dead_tup{asserts_env!=""} >\n pg_stat_user_tables_n_live_tup{asserts_env!=""}\n * on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_scale_factor{asserts_env!=""}\n + on(asserts_env, asserts_site, namespace, job, service, instance, server) group_left pg_settings_autovacuum_analyze_threshold{asserts_env!=""}\n )\n -\n pg_stat_user_tables_last_autoanalyze{asserts_env!=""}\n > 24 * 60 * 60\n )', |
339 | 306 | labels: {
|
340 |
| - asserts_severity: 'warning', |
| 307 | + severity: 'warning', |
341 | 308 | asserts_entity_type: 'DataSource',
|
342 | 309 | asserts_alert_category: 'failure',
|
343 | 310 | },
|
|
348 | 315 | description: '{{ $labels.instance }} is requesting too many checkpoints, which may lead to performance degradation.',
|
349 | 316 | summary: 'PostgreSQL too many checkpoints requested.',
|
350 | 317 | },
|
351 |
| - expr:' |
352 |
| - rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) / |
353 |
| - (rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{asserts_env!=""}[5m])) |
354 |
| - < 0.5', |
| 318 | + expr: '\n rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) /\n (rate(pg_stat_bgwriter_checkpoints_timed_total{asserts_env!=""}[5m]) + rate(pg_stat_bgwriter_checkpoints_req_total{asserts_env!=""}[5m]))\n < 0.5', |
355 | 319 | 'for': '5m',
|
356 | 320 | labels: {
|
357 |
| - asserts_severity: 'warning', |
358 |
| - asserts_entity_type: 'Service', |
359 |
| - asserts_alert_category: 'failure', |
| 321 | + severity: 'warning', |
360 | 322 | },
|
361 | 323 | },
|
362 | 324 | ],
|
|
0 commit comments