From e082fa831faa54a926bd4e32c7cd4f6cdfb21640 Mon Sep 17 00:00:00 2001
From: crunchy-containers-bot <sysops+containersbot@crunchydata.com>
Date: Thu, 28 Aug 2025 16:31:54 +0000
Subject: [PATCH] patching Kustomize monitoring installer

---
 .../monitoring/alertmanager/deployment.yaml   |  2 +-
 .../grafana/dashboards/pgbouncer_direct.json  |  4 +--
 .../dashboards/postgresql_details.json        | 16 ++++++------
 .../dashboards/postgresql_overview.json       |  2 +-
 .../grafana/dashboards/prometheus_alerts.json |  6 ++---
 kustomize/monitoring/grafana/deployment.yaml  |  2 +-
 .../config/crunchy-alert-rules-pg.yml         | 26 ++++++++++++++++---
 7 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/kustomize/monitoring/alertmanager/deployment.yaml b/kustomize/monitoring/alertmanager/deployment.yaml
index 8844a6e6..5453205f 100644
--- a/kustomize/monitoring/alertmanager/deployment.yaml
+++ b/kustomize/monitoring/alertmanager/deployment.yaml
@@ -8,7 +8,7 @@ spec:
     spec:
       containers:
       - name: alertmanager
-        image: prom/alertmanager:v0.27.0
+        image: prom/alertmanager:v0.28.1
         args:
         - --config.file=/etc/alertmanager/alertmanager.yml
         - --storage.path=/alertmanager
diff --git a/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json b/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json
index 68c23793..a70694f5 100644
--- a/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json
+++ b/kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json
@@ -28,7 +28,7 @@
       "name": "Prometheus",
       "version": "1.0.0"
     }
-  ],  
+  ],
   "annotations": {
     "list": [
       {
@@ -660,7 +660,7 @@
           ],
           "value": [
             "$__all"
-          ]      
+          ]
         },
         "datasource": "PROMETHEUS",
         "definition": "label_values(ccp_pgbouncer_databases_pool_size{cluster_name=\"[[cluster_name]]\", pod=\"[[pgbnode]]\"},name)",
diff --git a/kustomize/monitoring/grafana/dashboards/postgresql_details.json b/kustomize/monitoring/grafana/dashboards/postgresql_details.json
index 90866777..d38cf826 100644
--- a/kustomize/monitoring/grafana/dashboards/postgresql_details.json
+++ b/kustomize/monitoring/grafana/dashboards/postgresql_details.json
@@ -151,7 +151,7 @@
       "pluginVersion": "7.4.5",
       "targets": [
         {
-          "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ",
+          "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ",
           "format": "time_series",
           "interval": "",
           "intervalFactor": 1,
@@ -1701,29 +1701,29 @@
           "step": 2
         },
         {
-          "expr": "sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
+          "expr": "sum(ccp_stat_io_bgwriter_writes{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
           "format": "time_series",
           "intervalFactor": 2,
-          "legendFormat": "Backend",
-          "metric": "ccp_stat_bgwriter_buffers_backend",
+          "legendFormat": "Writes",
+          "metric": "ccp_stat_io_bgwriter_writes",
           "refId": "B",
           "step": 2
         },
         {
-          "expr": "sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
+          "expr": "sum(ccp_stat_io_bgwriter_fsyncs{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "FSync",
-          "metric": "ccp_stat_bgwriter_buffers_backend_fsync",
+          "metric": "ccp_stat_io_bgwriter_fsyncs",
           "refId": "C",
           "step": 2
         },
         {
-          "expr": "sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
+          "expr": "sum(ccp_stat_checkpointer_buffers_written{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
           "format": "time_series",
           "intervalFactor": 2,
           "legendFormat": "CheckPoint",
-          "metric": "ccp_stat_bgwriter_buffers_checkpoint",
+          "metric": "ccp_stat_checkpointer_buffers_written",
           "refId": "D",
           "step": 2
         },
diff --git a/kustomize/monitoring/grafana/dashboards/postgresql_overview.json b/kustomize/monitoring/grafana/dashboards/postgresql_overview.json
index b965463f..f9bf2e94 100644
--- a/kustomize/monitoring/grafana/dashboards/postgresql_overview.json
+++ b/kustomize/monitoring/grafana/dashboards/postgresql_overview.json
@@ -163,7 +163,7 @@
       "targets": [
         {
           "$hashKey": "object:243",
-          "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
+          "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(patroni_postgres_running{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
           "format": "time_series",
           "interval": "",
           "intervalFactor": 1,
diff --git a/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json b/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json
index f41aa481..e0090cf8 100644
--- a/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json
+++ b/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json
@@ -136,7 +136,7 @@
       "pluginVersion": "7.4.5",
       "targets": [
         {
-          "expr": "count(count by (kubernetes_namespace) (pg_up)) or count(count by (kubernetes_namespace) (up))",
+          "expr": "sum(count by (kubernetes_namespace) (pg_up{pg_cluster!=''})) + sum(count by (kubernetes_namespace) (patroni_postgres_running{pg_cluster!=''}))",
           "format": "time_series",
           "instant": true,
           "interval": "",
@@ -208,7 +208,7 @@
       "pluginVersion": "7.4.5",
       "targets": [
         {
-          "expr": "count(count by (pg_cluster) (pg_up)) or count(count by (pg_cluster) (up))",
+          "expr": "sum(count by (pg_cluster) (pg_up{pg_cluster!=''})) + sum(count by (pg_cluster) (patroni_postgres_running{pg_cluster!=''}))",
           "format": "time_series",
           "instant": true,
           "interval": "",
@@ -280,7 +280,7 @@
       "pluginVersion": "7.4.5",
       "targets": [
         {
-          "expr": "count(pg_up) or count(up)",
+          "expr": "sum(count(pg_up{pg_cluster!=''})) + sum(count(patroni_postgres_running{pg_cluster!=''}))",
           "format": "time_series",
           "instant": true,
           "interval": "",
diff --git a/kustomize/monitoring/grafana/deployment.yaml b/kustomize/monitoring/grafana/deployment.yaml
index ddfea378..67d60d8b 100644
--- a/kustomize/monitoring/grafana/deployment.yaml
+++ b/kustomize/monitoring/grafana/deployment.yaml
@@ -8,7 +8,7 @@ spec:
     spec:
       containers:
       - name: grafana
-        image: grafana/grafana:11.1.5
+        image: grafana/grafana:11.1.13
         ports:
         - containerPort: 3000
         env:
diff --git a/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml b/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml
index 83f666e4..b68a197e 100644
--- a/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml
+++ b/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml
@@ -22,7 +22,7 @@ groups:
 
 ########## SYSTEM RULES ##########
   - alert: ExporterDown
-    expr: avg_over_time(up[5m]) < 0.5
+    expr: avg_over_time(up{job=~"crunchy-otel-collector|crunchy-postgres-exporter",exported_job!="patroni"}[5m]) < 0.5
     for: 10s
     labels:
       service: system
@@ -35,15 +35,35 @@ groups:
 
 ########## POSTGRESQL RULES ##########
   - alert: PGIsUp
-    expr: "pg_up < 1 or up < 1"
+    expr: "pg_up < 1 or patroni_postgres_running < 1"
     for: 60s
     labels:
       service: postgresql
       severity: critical
       severity_num: 300
     annotations:
-      summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
+      summary: 'Metrics exporter running on {{ $labels.job }} is unable to communicate with the configured database'
 
+  - alert: PGNoPrimary
+    expr: max by (cluster_name) (ccp_is_in_recovery_status) < 2
+    for: 30s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      summary: 'cluster {{ $labels.cluster_name }} does not have a primary instance'
+
+# Optional rule (commented out; uncomment to enable): alert when a cluster has no replica instance
+#  - alert: PGNoReplica
+#    expr: min by (cluster_name) (ccp_is_in_recovery_status) > 1
+#    for: 30s
+#    labels:
+#      service: postgresql
+#      severity: critical
+#      severity_num: 300
+#    annotations:
+#      summary: 'cluster {{ $labels.cluster_name }} does not have a replica instance'
 
 # Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num".
 #