Skip to content

Commit 89a07af

Browse files
authored
patching Kustomize monitoring installer (CrunchyData#311)
1 parent 4f8d0d9 commit 89a07af

File tree

7 files changed

+39
-19
lines changed

7 files changed

+39
-19
lines changed

kustomize/monitoring/alertmanager/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ spec:
88
spec:
99
containers:
1010
- name: alertmanager
11-
image: prom/alertmanager:v0.27.0
11+
image: prom/alertmanager:v0.28.1
1212
args:
1313
- --config.file=/etc/alertmanager/alertmanager.yml
1414
- --storage.path=/alertmanager

kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"name": "Prometheus",
2929
"version": "1.0.0"
3030
}
31-
],
31+
],
3232
"annotations": {
3333
"list": [
3434
{
@@ -660,7 +660,7 @@
660660
],
661661
"value": [
662662
"$__all"
663-
]
663+
]
664664
},
665665
"datasource": "PROMETHEUS",
666666
"definition": "label_values(ccp_pgbouncer_databases_pool_size{cluster_name=\"[[cluster_name]]\", pod=\"[[pgbnode]]\"},name)",

kustomize/monitoring/grafana/dashboards/postgresql_details.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@
151151
"pluginVersion": "7.4.5",
152152
"targets": [
153153
{
154-
"expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ",
154+
"expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ",
155155
"format": "time_series",
156156
"interval": "",
157157
"intervalFactor": 1,
@@ -1701,29 +1701,29 @@
17011701
"step": 2
17021702
},
17031703
{
1704-
"expr": "sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
1704+
"expr": "sum(ccp_stat_io_bgwriter_writes{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
17051705
"format": "time_series",
17061706
"intervalFactor": 2,
1707-
"legendFormat": "Backend",
1708-
"metric": "ccp_stat_bgwriter_buffers_backend",
1707+
"legendFormat": "Writes",
1708+
"metric": "ccp_stat_io_bgwriter_writes",
17091709
"refId": "B",
17101710
"step": 2
17111711
},
17121712
{
1713-
"expr": "sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
1713+
"expr": "sum(ccp_stat_io_bgwriter_fsyncs{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
17141714
"format": "time_series",
17151715
"intervalFactor": 2,
17161716
"legendFormat": "FSync",
1717-
"metric": "ccp_stat_bgwriter_buffers_backend_fsync",
1717+
"metric": "ccp_stat_io_bgwriter_fsyncs",
17181718
"refId": "C",
17191719
"step": 2
17201720
},
17211721
{
1722-
"expr": "sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
1722+
"expr": "sum(ccp_stat_checkpointer_buffers_written{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
17231723
"format": "time_series",
17241724
"intervalFactor": 2,
17251725
"legendFormat": "CheckPoint",
1726-
"metric": "ccp_stat_bgwriter_buffers_checkpoint",
1726+
"metric": "ccp_stat_checkpointer_buffers_written",
17271727
"refId": "D",
17281728
"step": 2
17291729
},

kustomize/monitoring/grafana/dashboards/postgresql_overview.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@
163163
"targets": [
164164
{
165165
"$hashKey": "object:243",
166-
"expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
166+
"expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(patroni_postgres_running{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
167167
"format": "time_series",
168168
"interval": "",
169169
"intervalFactor": 1,

kustomize/monitoring/grafana/dashboards/prometheus_alerts.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@
136136
"pluginVersion": "7.4.5",
137137
"targets": [
138138
{
139-
"expr": "count(count by (kubernetes_namespace) (pg_up)) or count(count by (kubernetes_namespace) (up))",
139+
"expr": "sum(count by (kubernetes_namespace) (pg_up{pg_cluster!=''})) + sum(count by (kubernetes_namespace) (patroni_postgres_running{pg_cluster!=''}))",
140140
"format": "time_series",
141141
"instant": true,
142142
"interval": "",
@@ -208,7 +208,7 @@
208208
"pluginVersion": "7.4.5",
209209
"targets": [
210210
{
211-
"expr": "count(count by (pg_cluster) (pg_up)) or count(count by (pg_cluster) (up))",
211+
"expr": "sum(count by (pg_cluster) (pg_up{pg_cluster!=''})) + sum(count by (pg_cluster) (patroni_postgres_running{pg_cluster!=''}))",
212212
"format": "time_series",
213213
"instant": true,
214214
"interval": "",
@@ -280,7 +280,7 @@
280280
"pluginVersion": "7.4.5",
281281
"targets": [
282282
{
283-
"expr": "count(pg_up) or count(up)",
283+
"expr": "sum(count(pg_up{pg_cluster!=''})) + sum(count(patroni_postgres_running{pg_cluster!=''}))",
284284
"format": "time_series",
285285
"instant": true,
286286
"interval": "",

kustomize/monitoring/grafana/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ spec:
88
spec:
99
containers:
1010
- name: grafana
11-
image: grafana/grafana:11.1.5
11+
image: grafana/grafana:11.1.13
1212
ports:
1313
- containerPort: 3000
1414
env:

kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ groups:
2222

2323
########## SYSTEM RULES ##########
2424
- alert: ExporterDown
25-
expr: avg_over_time(up[5m]) < 0.5
25+
expr: avg_over_time(up{job=~"crunchy-otel-collector|crunchy-postgres-exporter",exported_job!="patroni"}[5m]) < 0.5
2626
for: 10s
2727
labels:
2828
service: system
@@ -35,15 +35,35 @@ groups:
3535

3636
########## POSTGRESQL RULES ##########
3737
- alert: PGIsUp
38-
expr: "pg_up < 1 or up < 1"
38+
expr: "pg_up < 1 or patroni_postgres_running < 1"
3939
for: 60s
4040
labels:
4141
service: postgresql
4242
severity: critical
4343
severity_num: 300
4444
annotations:
45-
summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
45+
summary: 'Metrics exporter running on {{ $labels.job }} is unable to communicate with the configured database'
4646

47+
- alert: PGNoPrimary
48+
expr: max by (cluster_name) (ccp_is_in_recovery_status) < 2
49+
for: 30s
50+
labels:
51+
service: postgresql
52+
severity: critical
53+
severity_num: 300
54+
annotations:
55+
summary: 'cluster {{ $labels.cluster_name }} does not have a primary instance'
56+
57+
# Optional: alert when a cluster has no replica instance (disabled by default; uncomment to enable)
58+
# - alert: PGNoReplica
59+
# expr: min by (cluster_name) (ccp_is_in_recovery_status) > 1
60+
# for: 30s
61+
# labels:
62+
# service: postgresql
63+
# severity: critical
64+
# severity_num: 300
65+
# annotations:
66+
# summary: 'cluster {{ $labels.cluster_name }} does not have a replica instance'
4767

4868
# Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num".
4969
#

0 commit comments

Comments
 (0)