Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion kustomize/monitoring/alertmanager/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ spec:
spec:
containers:
- name: alertmanager
image: prom/alertmanager:v0.27.0
image: prom/alertmanager:v0.28.1
args:
- --config.file=/etc/alertmanager/alertmanager.yml
- --storage.path=/alertmanager
Expand Down
4 changes: 2 additions & 2 deletions kustomize/monitoring/grafana/dashboards/pgbouncer_direct.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"name": "Prometheus",
"version": "1.0.0"
}
],
],
"annotations": {
"list": [
{
Expand Down Expand Up @@ -660,7 +660,7 @@
],
"value": [
"$__all"
]
]
},
"datasource": "PROMETHEUS",
"definition": "label_values(ccp_pgbouncer_databases_pool_size{cluster_name=\"[[cluster_name]]\", pod=\"[[pgbnode]]\"},name)",
Expand Down
16 changes: 8 additions & 8 deletions kustomize/monitoring/grafana/dashboards/postgresql_details.json
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ",
"expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down Expand Up @@ -1701,29 +1701,29 @@
"step": 2
},
{
"expr": "sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
"expr": "sum(ccp_stat_io_bgwriter_writes{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Backend",
"metric": "ccp_stat_bgwriter_buffers_backend",
"legendFormat": "Writes",
"metric": "ccp_stat_io_bgwriter_writes",
"refId": "B",
"step": 2
},
{
"expr": "sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
"expr": "sum(ccp_stat_io_bgwriter_fsyncs{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_backend_fsync{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "FSync",
"metric": "ccp_stat_bgwriter_buffers_backend_fsync",
"metric": "ccp_stat_io_bgwriter_fsyncs",
"refId": "C",
"step": 2
},
{
"expr": "sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
"expr": "sum(ccp_stat_checkpointer_buffers_written{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"}) or sum(ccp_stat_bgwriter_buffers_checkpoint{pg_cluster=\"[[cluster]]\",pod=~\"[[pod]]\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "CheckPoint",
"metric": "ccp_stat_bgwriter_buffers_checkpoint",
"metric": "ccp_stat_checkpointer_buffers_written",
"refId": "D",
"step": 2
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@
"targets": [
{
"$hashKey": "object:243",
"expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
"expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(patroni_postgres_running{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "count(count by (kubernetes_namespace) (pg_up)) or count(count by (kubernetes_namespace) (up))",
"expr": "sum(count by (kubernetes_namespace) (pg_up{pg_cluster!=''})) + sum(count by (kubernetes_namespace) (patroni_postgres_running{pg_cluster!=''}))",
"format": "time_series",
"instant": true,
"interval": "",
Expand Down Expand Up @@ -208,7 +208,7 @@
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "count(count by (pg_cluster) (pg_up)) or count(count by (pg_cluster) (up))",
"expr": "sum(count by (pg_cluster) (pg_up{pg_cluster!=''})) + sum(count by (pg_cluster) (patroni_postgres_running{pg_cluster!=''}))",
"format": "time_series",
"instant": true,
"interval": "",
Expand Down Expand Up @@ -280,7 +280,7 @@
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "count(pg_up) or count(up)",
"expr": "sum(count(pg_up{pg_cluster!=''})) + sum(count(patroni_postgres_running{pg_cluster!=''}))",
"format": "time_series",
"instant": true,
"interval": "",
Expand Down
2 changes: 1 addition & 1 deletion kustomize/monitoring/grafana/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ spec:
spec:
containers:
- name: grafana
image: grafana/grafana:11.1.5
image: grafana/grafana:11.1.13
ports:
- containerPort: 3000
env:
Expand Down
26 changes: 23 additions & 3 deletions kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ groups:

########## SYSTEM RULES ##########
- alert: ExporterDown
expr: avg_over_time(up[5m]) < 0.5
expr: avg_over_time(up{job=~"crunchy-otel-collector|crunchy-postgres-exporter",exported_job!="patroni"}[5m]) < 0.5
for: 10s
labels:
service: system
Expand All @@ -35,15 +35,35 @@ groups:

########## POSTGRESQL RULES ##########
- alert: PGIsUp
expr: "pg_up < 1 or up < 1"
expr: "pg_up < 1 or patroni_postgres_running < 1"
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
summary: 'Metrics exporter running on {{ $labels.job }} is unable to communicate with the configured database'

- alert: PGNoPrimary
expr: max by (cluster_name) (ccp_is_in_recovery_status) < 2
for: 30s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
summary: 'cluster {{ $labels.cluster_name }} does not have a primary instance'

# Alert on missing or absent replicas
# - alert: PGNoReplica
# expr: min by (cluster_name) (ccp_is_in_recovery_status) > 1
# for: 30s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# summary: 'cluster {{ $labels.cluster_name }} does not have a replica instance'

# Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num".
#
Expand Down