From e082fa831faa54a926bd4e32c7cd4f6cdfb21640 Mon Sep 17 00:00:00 2001 From: crunchy-containers-bot Date: Thu, 28 Aug 2025 16:31:54 +0000 Subject: [PATCH] patching Kustomize monitoring installer --- .../monitoring/alertmanager/deployment.yaml | 2 +- .../grafana/dashboards/pgbouncer_direct.json | 4 +-- .../dashboards/postgresql_details.json | 16 ++++++------ .../dashboards/postgresql_overview.json | 2 +- .../grafana/dashboards/prometheus_alerts.json | 6 ++--- kustomize/monitoring/grafana/deployment.yaml | 2 +- .../config/crunchy-alert-rules-pg.yml | 26 ++++++++++++++++--- 7 files changed, 39 insertions(+), 19 deletions(-) diff --git a/kustomize/monitoring/alertmanager/deployment.yaml b/kustomize/monitoring/alertmanager/deployment.yaml index 8844a6e6..5453205f 100644 --- a/kustomize/monitoring/alertmanager/deployment.yaml +++ b/kustomize/monitoring/alertmanager/deployment.yaml @@ -8,7 +8,7 @@ spec: spec: containers: - name: alertmanager - image: prom/alertmanager:v0.27.0 + image: prom/alertmanager:v0.28.1 args: - --config.file=/etc/alertmanager/alertmanager.yml - --storage.path=/alertmanager diff --git a/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json b/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json index 68c23793..a70694f5 100644 --- a/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json +++ b/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json @@ -28,7 +28,7 @@ "name": "Prometheus", "version": "1.0.0" } - ], + ], "annotations": { "list": [ { @@ -660,7 +660,7 @@ ], "value": [ "$__all" - ] + ] }, "datasource": "PROMETHEUS", "definition": "label_values(ccp_pgbouncer_databases_pool_size{cluster_name=\"[[cluster_name]]\", pod=\"[[pgbnode]]\"},name)", diff --git a/kustomize/monitoring/grafana/dashboards/postgresql_details.json b/kustomize/monitoring/grafana/dashboards/postgresql_details.json index 90866777..d38cf826 100644 --- a/kustomize/monitoring/grafana/dashboards/postgresql_details.json +++ b/kustomize/monitoring/grafana/dashboards/postgresql_details.json @@ -151,7 +151,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ", + "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -1701,29 +1701,29 @@ "step": 2 }, { - "expr": "sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})", + "expr": "sum(ccp_stat_io_bgwriter_writes{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Backend", - "metric": "ccp_stat_bgwriter_buffers_backend", + "legendFormat": "Writes", + "metric": "ccp_stat_io_bgwriter_writes", "refId": "B", "step": 2 }, { - "expr": "sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})", + "expr": "sum(ccp_stat_io_bgwriter_fsyncs{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "FSync", - "metric": "ccp_stat_bgwriter_buffers_backend_fsync", + "metric": "ccp_stat_io_bgwriter_fsyncs", "refId": "C", "step": 2 }, { - "expr": "sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})", + "expr": "sum(ccp_stat_checkpointer_buffers_written{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "CheckPoint", - "metric": "ccp_stat_bgwriter_buffers_checkpoint", + "metric": "ccp_stat_checkpointer_buffers_written", "refId": "D", "step": 2 }, diff --git a/kustomize/monitoring/grafana/dashboards/postgresql_overview.json b/kustomize/monitoring/grafana/dashboards/postgresql_overview.json index b965463f..f9bf2e94 100644 --- a/kustomize/monitoring/grafana/dashboards/postgresql_overview.json +++ b/kustomize/monitoring/grafana/dashboards/postgresql_overview.json @@ -163,7 +163,7 @@ "targets": [ { "$hashKey": "object:243", - "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})", + "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(patroni_postgres_running{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})", "format": "time_series", "interval": "", "intervalFactor": 1, diff --git a/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json b/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json index f41aa481..e0090cf8 100644 --- a/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json +++ b/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json @@ -136,7 +136,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(count by (kubernetes_namespace) (pg_up)) or count(count by (kubernetes_namespace) (up))", + "expr": "sum(count by (kubernetes_namespace) (pg_up{pg_cluster!=''})) + sum(count by (kubernetes_namespace) (patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", @@ -208,7 +208,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(count by (pg_cluster) (pg_up)) or count(count by (pg_cluster) (up))", + "expr": "sum(count by (pg_cluster) (pg_up{pg_cluster!=''})) + sum(count by (pg_cluster) (patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", @@ -280,7 +280,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(pg_up) or count(up)", + "expr": "sum(count(pg_up{pg_cluster!=''})) + sum(count(patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", diff --git a/kustomize/monitoring/grafana/deployment.yaml b/kustomize/monitoring/grafana/deployment.yaml index ddfea378..67d60d8b 100644 --- a/kustomize/monitoring/grafana/deployment.yaml +++ b/kustomize/monitoring/grafana/deployment.yaml @@ -8,7 +8,7 @@ spec: spec: containers: - name: grafana - image: grafana/grafana:11.1.5 + image: grafana/grafana:11.1.13 ports: - containerPort: 3000 env: diff --git a/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml b/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml index 83f666e4..b68a197e 100644 --- a/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml +++ b/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml @@ -22,7 +22,7 @@ groups: ########## SYSTEM RULES ########## - alert: ExporterDown - expr: avg_over_time(up[5m]) < 0.5 + expr: avg_over_time(up{job=~"crunchy-otel-collector|crunchy-postgres-exporter",exported_job!="patroni"}[5m]) < 0.5 for: 10s labels: service: system @@ -35,15 +35,35 @@ groups: ########## POSTGRESQL RULES ########## - alert: PGIsUp - expr: "pg_up < 1 or up < 1" + expr: "pg_up < 1 or patroni_postgres_running < 1" for: 60s labels: service: postgresql severity: critical severity_num: 300 annotations: - summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database' + summary: 'Metrics exporter running on {{ $labels.job }} is unable to communicate with the configured database' + - alert: PGNoPrimary + expr: max by (cluster_name) (ccp_is_in_recovery_status) < 2 + for: 30s + labels: + service: postgresql + severity: critical + severity_num: 300 + annotations: + summary: 'cluster {{ $labels.cluster_name }} does not have a primary instance' + +# Alert on missing or absent replicas +# - alert: PGNoReplica +# expr: min by (cluster_name) (ccp_is_in_recovery_status) > 1 +# for: 30s +# labels: +# service: postgresql +# severity: critical +# severity_num: 300 +# annotations: +# summary: 'cluster {{ $labels.cluster_name }} does not have a replica instance' # Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num". #