From 839dc6c4ab66c25539ae60342051878b3f623992 Mon Sep 17 00:00:00 2001 From: Ben Blattberg Date: Thu, 29 Sep 2022 14:27:54 -0500 Subject: [PATCH] Update exporter dashboards Issue: [sc-15707] --- kustomize/monitoring/alertmanager-config.yaml | 4 +- .../monitoring/alertmanager-rules-config.yaml | 58 +++++++++++++++---- .../monitoring/crunchy_grafana_dashboards.yml | 4 +- .../monitoring/dashboards/pgbackrest.json | 4 +- .../monitoring/dashboards/pod_details.json | 4 +- .../dashboards/postgres_overview.json | 2 +- .../dashboards/postgresql_details.json | 6 +- .../dashboards/postgresql_service_health.json | 6 +- .../dashboards/prometheus_alerts.json | 2 +- .../dashboards/query_statistics.json | 2 +- kustomize/monitoring/deploy-grafana.yaml | 2 +- kustomize/monitoring/deploy-prometheus.yaml | 2 +- 12 files changed, 67 insertions(+), 29 deletions(-) diff --git a/kustomize/monitoring/alertmanager-config.yaml b/kustomize/monitoring/alertmanager-config.yaml index 4abdd314..b3328146 100644 --- a/kustomize/monitoring/alertmanager-config.yaml +++ b/kustomize/monitoring/alertmanager-config.yaml @@ -3,7 +3,7 @@ data: alertmanager.yml: | ### # - # Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved. + # Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved. # ### @@ -31,7 +31,7 @@ data: receivers: - name: 'default-receiver' email_configs: - - to: 'example@yourcompany.com' + - to: 'example@crunchydata.com' send_resolved: true ## Examples of alternative alert receivers. See documentation for more info on how to configure these fully diff --git a/kustomize/monitoring/alertmanager-rules-config.yaml b/kustomize/monitoring/alertmanager-rules-config.yaml index 9e657753..4536ed7f 100644 --- a/kustomize/monitoring/alertmanager-rules-config.yaml +++ b/kustomize/monitoring/alertmanager-rules-config.yaml @@ -23,6 +23,19 @@ data: summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )' + ########## SYSTEM RULES ########## + - alert: ExporterDown + expr: avg_over_time(up[5m]) < 0.5 + for: 10s + labels: + service: system + severity: critical + severity_num: 300 + annotations: + description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.' + summary: 'Prometheus Exporter Service Down' + + ########## POSTGRESQL RULES ########## - alert: PGIsUp expr: pg_up < 1 @@ -173,6 +186,27 @@ data: description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)' summary: 'PGSQL Instance connections' + - alert: DiskFillPredict + expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70 + for: 5m + labels: + service: postgresql + severity: warning + severity_num: 200 + annotations: + summary: 'Disk predicted to be full in 24 hours' + description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage' + + - alert: PGClusterRoleChange + expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 + annotations: + summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details' + - alert: PGDiskSize expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75 for: 60s @@ -196,7 +230,7 @@ data: summary: 'PGSQL Instance size critical' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 5.24288e+07 + expr: ccp_replication_lag_size_bytes > 5.24288e+07 for: 60s labels: service: postgresql @@ -207,7 +241,7 @@ data: summary: 'PGSQL Instance replica lag warning' - alert: PGReplicationByteLag - expr: ccp_replication_status_byte_lag > 1.048576e+08 + expr: ccp_replication_lag_size_bytes > 1.048576e+08 for: 60s labels: service: postgresql @@ -313,12 +347,15 @@ data: # Otherwise rule will be applied to all stanzas returned on target system if not set. # # Relevant metric names are: - # ccp_backrest_last_full_time_since_completion_seconds - # ccp_backrest_last_incr_time_since_completion_seconds - # ccp_backrest_last_diff_time_since_completion_seconds + # ccp_backrest_last_full_backup_time_since_completion_seconds + # ccp_backrest_last_incr_backup_time_since_completion_seconds + # ccp_backrest_last_diff_backup_time_since_completion_seconds + # + # To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day. + # Further adjustment may be needed depending on your backup runtimes/schedule. # # - alert: PGBackRestLastCompletedFull_main - # expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800 + # expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000 # for: 60s # labels: # service: postgresql @@ -328,7 +365,7 @@ data: # summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.' # # - alert: PGBackRestLastCompletedIncr_main - # expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400 + # expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600 # for: 60s # labels: # service: postgresql @@ -340,14 +377,14 @@ data: # # Runtime monitoring is handled with a single metric: # - # ccp_backrest_last_runtime_backup_runtime_seconds + # ccp_backrest_last_info_backup_runtime_seconds # # Runtime monitoring should have the "backup_type" label set. # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr) # Stanza should also be set if runtimes per stanza have different expected times # # - alert: PGBackRestLastRuntimeFull_main - # expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 + # expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 # for: 60s # labels: # service: postgresql @@ -357,7 +394,7 @@ data: # summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours' # # - alert: PGBackRestLastRuntimeDiff_main - # expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 + # expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 # for: 60s # labels: # service: postgresql @@ -382,6 +419,7 @@ data: # severity_num: 300 # annotations: # description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.' + kind: ConfigMap metadata: labels: diff --git a/kustomize/monitoring/crunchy_grafana_dashboards.yml b/kustomize/monitoring/crunchy_grafana_dashboards.yml index 91ae801f..3a9e75b0 100644 --- a/kustomize/monitoring/crunchy_grafana_dashboards.yml +++ b/kustomize/monitoring/crunchy_grafana_dashboards.yml @@ -1,6 +1,6 @@ ### # -# Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved. +# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved. # ### apiVersion: 1 @@ -13,4 +13,4 @@ providers: disableDeletion: false updateIntervalSeconds: 3 #how often Grafana will scan for changed dashboards options: - path: $GF_PATHS_PROVISIONING/dashboards + path: /etc/grafana/provisioning/dashboards diff --git a/kustomize/monitoring/dashboards/pgbackrest.json b/kustomize/monitoring/dashboards/pgbackrest.json index ec3704e9..d50f5c31 100644 --- a/kustomize/monitoring/dashboards/pgbackrest.json +++ b/kustomize/monitoring/dashboards/pgbackrest.json @@ -52,7 +52,7 @@ "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1624546649377, + "iteration": 1625069660860, "links": [ { "asDropdown": false, @@ -664,7 +664,7 @@ ] }, "time": { - "from": "now-30m", + "from": "now-2w", "to": "now" }, "timepicker": { diff --git a/kustomize/monitoring/dashboards/pod_details.json b/kustomize/monitoring/dashboards/pod_details.json index 6789e89a..a4b379e3 100644 --- a/kustomize/monitoring/dashboards/pod_details.json +++ b/kustomize/monitoring/dashboards/pod_details.json @@ -42,11 +42,11 @@ } ] }, - "editable": true, + "editable": false, "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1624647381559, + "iteration": 1625069717503, "links": [ { "icon": "external link", diff --git a/kustomize/monitoring/dashboards/postgres_overview.json b/kustomize/monitoring/dashboards/postgres_overview.json index 145f2d2f..48144270 100644 --- a/kustomize/monitoring/dashboards/postgres_overview.json +++ b/kustomize/monitoring/dashboards/postgres_overview.json @@ -46,7 +46,7 @@ "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1624491413218, + "iteration": 1625069480601, "links": [], "panels": [ { diff --git a/kustomize/monitoring/dashboards/postgresql_details.json b/kustomize/monitoring/dashboards/postgresql_details.json index 404b30c9..d1843985 100644 --- a/kustomize/monitoring/dashboards/postgresql_details.json +++ b/kustomize/monitoring/dashboards/postgresql_details.json @@ -54,11 +54,11 @@ } ] }, - "editable": true, + "editable": false, "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1624495934950, + "iteration": 1625069813048, "links": [ { "asDropdown": false, @@ -2143,6 +2143,6 @@ }, "timezone": "browser", "title": "PostgreSQLDetails", - "uid": "pc4NNgknk", + "uid": "fMip0cuMk", "version": 1 } diff --git a/kustomize/monitoring/dashboards/postgresql_service_health.json b/kustomize/monitoring/dashboards/postgresql_service_health.json index 929ea787..2bee1d0b 100644 --- a/kustomize/monitoring/dashboards/postgresql_service_health.json +++ b/kustomize/monitoring/dashboards/postgresql_service_health.json @@ -42,11 +42,11 @@ } ] }, - "editable": true, + "editable": false, "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1624491530019, + "iteration": 1625069909806, "links": [ { "asDropdown": false, @@ -626,7 +626,7 @@ ] }, "time": { - "from": "now-30m", + "from": "now-1h", "to": "now" }, "timepicker": { diff --git a/kustomize/monitoring/dashboards/prometheus_alerts.json b/kustomize/monitoring/dashboards/prometheus_alerts.json index 83e6ea06..ef8fb41a 100644 --- a/kustomize/monitoring/dashboards/prometheus_alerts.json +++ b/kustomize/monitoring/dashboards/prometheus_alerts.json @@ -938,7 +938,7 @@ "list": [] }, "time": { - "from": "now-30m", + "from": "now-1h", "to": "now" }, "timepicker": { diff --git a/kustomize/monitoring/dashboards/query_statistics.json b/kustomize/monitoring/dashboards/query_statistics.json index 88c64db5..2f849061 100644 --- a/kustomize/monitoring/dashboards/query_statistics.json +++ b/kustomize/monitoring/dashboards/query_statistics.json @@ -59,7 +59,7 @@ "gnetId": null, "graphTooltip": 0, "id": null, - "iteration": 1624501789811, + "iteration": 1625070004605, "links": [ { "icon": "external link", diff --git a/kustomize/monitoring/deploy-grafana.yaml b/kustomize/monitoring/deploy-grafana.yaml index 7e437b49..af280906 100644 --- a/kustomize/monitoring/deploy-grafana.yaml +++ b/kustomize/monitoring/deploy-grafana.yaml @@ -37,7 +37,7 @@ spec: value: crunchy-prometheus - name: PROM_PORT value: "9090" - image: grafana/grafana:7.4.5 + image: grafana/grafana:8.5.10 imagePullPolicy: IfNotPresent livenessProbe: failureThreshold: 3 diff --git a/kustomize/monitoring/deploy-prometheus.yaml b/kustomize/monitoring/deploy-prometheus.yaml index 65224e3b..fd574e01 100644 --- a/kustomize/monitoring/deploy-prometheus.yaml +++ b/kustomize/monitoring/deploy-prometheus.yaml @@ -27,7 +27,7 @@ spec: name: crunchy-prometheus spec: containers: - - image: prom/prometheus:v2.27.1 + - image: prom/prometheus:v2.33.5 imagePullPolicy: IfNotPresent livenessProbe: failureThreshold: 3