diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 7a73b63..4433626 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -17,12 +17,12 @@ jobs: - name: Setup Go uses: actions/setup-go@v2 with: - go-version: 1.22 + go-version: '1.23' - name: Install Helm uses: azure/setup-helm@v4 with: - version: v3.14.4 + version: v3.17.1 - name: Install yq run: | diff --git a/.github/workflows/nightly-build.yaml b/.github/workflows/nightly-build.yaml index 40a6935..e64f92c 100644 --- a/.github/workflows/nightly-build.yaml +++ b/.github/workflows/nightly-build.yaml @@ -21,7 +21,7 @@ jobs: - name: Install Helm uses: azure/setup-helm@v4 with: - version: v3.14.4 + version: v3.17.1 - name: Install yq run: | @@ -31,11 +31,11 @@ jobs: - name: make build run: | make build > output.log 2>&1 - continue-on-error: true + continue-on-error: false - name: Upload script output - if: failure() - uses: actions/upload-artifact@v2 + if: always() # build output only exists in output.log, so upload it even when make build fails + uses: actions/upload-artifact@v4 with: name: script-output path: output.log diff --git a/CHANGELOG.md b/CHANGELOG.md index f97b441..8bb9049 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +## v0.3.0 + +- Adding prebuilt workspace dashboard & alerts + ## v0.2.1 - Upgraded subcharts @@ -23,4 +27,4 @@ ## v0.0.1 -- Initial release \ No newline at end of file +- Initial release diff --git a/README.gotmpl b/README.gotmpl index 840296e..411d638 100644 --- a/README.gotmpl +++ b/README.gotmpl @@ -199,6 +199,22 @@ stringData: password: "" # this matches the "passwordKey" field above ``` +To add an Ingress for Grafana, define this in your `values.yaml`: + +```yaml +grafana: + grafana.ini: + server: + domain: observability.example.com + root_url: "%(protocol)s://%(domain)s/grafana" + serve_from_sub_path: true + ingress: + enabled: true + hosts: + - "observability.example.com" + path: "/" +``` + ## Subcharts {{ template "chart.requirementsTable" . 
}} diff --git a/README.md b/README.md index f479746..616428a 100644 --- a/README.md +++ b/README.md @@ -199,14 +199,30 @@ stringData: password: "" # this matches the "passwordKey" field above ``` +To add an Ingress for Grafana, define this in your `values.yaml`: + +```yaml +grafana: + grafana.ini: + server: + domain: observability.example.com + root_url: "%(protocol)s://%(domain)s/grafana" + serve_from_sub_path: true + ingress: + enabled: true + hosts: + - "observability.example.com" + path: "/" +``` + ## Subcharts | Repository | Name | Version | |------------|------|---------| -| https://grafana.github.io/helm-charts | grafana | ^v7.3.7 | -| https://grafana.github.io/helm-charts | grafana-agent(grafana-agent) | ^0.37.0 | -| https://grafana.github.io/helm-charts | loki | ^v6.3.4 | -| https://prometheus-community.github.io/helm-charts | prometheus | ^v25.18.0 | +| https://grafana.github.io/helm-charts | grafana | ~v7.3.7 | +| https://grafana.github.io/helm-charts | grafana-agent(grafana-agent) | ~0.37.0 | +| https://grafana.github.io/helm-charts | loki | ~v6.7.3 | +| https://prometheus-community.github.io/helm-charts | prometheus | ~v25.24.1 | Each subchart can be disabled by setting the `enabled` field to `false`. 
@@ -228,7 +244,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | Key | Type | Default | Description | |-----|------|---------|-------------| -| global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | +| global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"UnprovisionedPrebuiltWorkspaces":{"delay":"10m","enabled":true,"thresholds":{"warn":1}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various 
aspects of Coder | | global.coder.coderdSelector | string | `"pod=~`coder.*`, pod!~`.*provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. ensure this uses backticks for quotes! | | global.coder.controlPlaneNamespace | string | `"coder"` | the namespace into which the control plane has been deployed. | | global.coder.externalProvisionersNamespace | string | `"coder"` | the namespace into which any external provisioners have been deployed. | @@ -242,7 +258,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look | | global.externalScheme | string | `"http"` | | | global.externalZone | string | `"svc.cluster.local"` | | -| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","username":"coder"}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | +| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","username":"coder"}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | | 
global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres | | global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"}}` | control telemetry collection | | global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection | @@ -360,6 +376,10 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | grafana.extraConfigmapMounts[4].mountPath | string | `"/var/lib/grafana/dashboards/coder/4"` | | | grafana.extraConfigmapMounts[4].name | string | `"dashboards-workspace-detail"` | | | grafana.extraConfigmapMounts[4].readOnly | bool | `false` | | +| grafana.extraConfigmapMounts[5].configMap | string | `"dashboards-prebuilds"` | | +| grafana.extraConfigmapMounts[5].mountPath | string | `"/var/lib/grafana/dashboards/coder/5"` | | +| grafana.extraConfigmapMounts[5].name | string | `"dashboards-prebuilds"` | | +| grafana.extraConfigmapMounts[5].readOnly | bool | `false` | | | grafana.fullnameOverride | string | `"grafana"` | | | grafana.persistence.enabled | bool | `true` | | | grafana.persistence.size | string | `"10Gi"` | | @@ -464,6 +484,8 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | prometheus.server.service.type | string | `"ClusterIP"` | | | prometheus.server.statefulSet.enabled | bool | `true` | | | prometheus.serverFiles."prometheus.yml".rule_files[0] | string | `"/etc/config/alerts/*.yaml"` | | -| prometheus.serverFiles."prometheus.yml".scrape_configs | string | `nil` | | +| prometheus.serverFiles."prometheus.yml".scrape_configs | list | `[]` | | | prometheus.testFramework.enabled | bool | `false` | | +| runbookViewer.image | string | `"dannyben/madness"` | | +| 
sqlExporter.image | string | `"burningalchemist/sql_exporter"` | | diff --git a/coder-observability/Chart.lock b/coder-observability/Chart.lock index 2aeb46c..1782a88 100644 --- a/coder-observability/Chart.lock +++ b/coder-observability/Chart.lock @@ -4,12 +4,12 @@ dependencies: version: 7.3.12 - name: prometheus repository: https://prometheus-community.github.io/helm-charts - version: 25.24.1 + version: 25.24.2 - name: loki repository: https://grafana.github.io/helm-charts - version: 6.7.3 + version: 6.7.4 - name: grafana-agent repository: https://grafana.github.io/helm-charts version: 0.37.0 -digest: sha256:10a5d2b617b691e0ed87ca9e31c86618e05ca3b8031ddb3b417610f47e8bb069 -generated: "2024-07-26T10:52:20.819468+02:00" +digest: sha256:05e0dae0200cabf5cb9e2cfb18a4e166fcaceefaf39827addff4299b18c31d4e +generated: "2025-01-16T07:54:38.036598102Z" diff --git a/coder-observability/Chart.yaml b/coder-observability/Chart.yaml index 8de1651..9e40bfa 100644 --- a/coder-observability/Chart.yaml +++ b/coder-observability/Chart.yaml @@ -8,20 +8,20 @@ dependencies: - name: grafana condition: grafana.enabled repository: https://grafana.github.io/helm-charts - version: '^v7.3.7' + version: '~v7.3.7' - name: prometheus condition: prometheus.enabled repository: https://prometheus-community.github.io/helm-charts - version: '^v25.18.0' + version: '~v25.24.1' - name: loki condition: loki.enabled repository: https://grafana.github.io/helm-charts - version: '^v6.3.4' + version: '~v6.7.3' - name: grafana-agent alias: grafana-agent condition: grafana-agent.enabled repository: https://grafana.github.io/helm-charts - version: '^0.37.0' + version: '~0.37.0' maintainers: - name: Coder Technologies, Inc. 
url: https://github.com/coder/observability/issues diff --git a/coder-observability/runbooks/coderd.md b/coder-observability/runbooks/coderd.md index 62c80f5..4a42444 100644 --- a/coder-observability/runbooks/coderd.md +++ b/coder-observability/runbooks/coderd.md @@ -76,3 +76,60 @@ Terraform plugin. Your Enterprise license is approaching or has exceeded the number of seats purchased. Please contact your Coder sales contact, or visit https://coder.com/contact/sales. + +## CoderdIneligiblePrebuilds + +Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup +scripts have completed. + +If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. + +## CoderdUnprovisionedPrebuiltWorkspaces + +The number of running prebuilt workspaces is lower than the desired instances. This could be for several reasons, +ordered by likelihood: + +### Experiment/License + +The prebuilds feature is currently gated behind an experiment *and* a premium license. + +Ensure that the prebuilds experiment is enabled with `CODER_EXPERIMENTS=workspace-prebuilds`, and that you have a premium +license added. + +### Preset Validation Issue + +Templates which have prebuilds configured will require a configured preset defined, with ALL of the required parameters +set in the preset. If any of these are missing, or any of the parameters - as defined - fail validation, then the prebuilds +subsystem will refuse to attempt a workspace build. + +Consult the coderd logs for more information; look out for errors or warnings from the prebuilds subsystem. + +### Template Misconfiguration or Error + +Prebuilt workspaces cannot be provisioned due to some issue at `terraform apply`-time. This could be due to misconfigured +cloud resources, improper authorization, or any number of other issues. + +Visit the Workspaces page, change the search term to `owner:prebuilds`, and view the previously failed builds. 
The +error will likely be quite obvious. + +### Provisioner Latency + +If your provisioners are overloaded and cannot process provisioner jobs quickly enough, prebuilt workspaces may be affected. +There is no prioritization at present for prebuilt workspace jobs. + +Ensure your provisioners are appropriately resourced (i.e. you have enough instances) to handle the concurrent build demand. + +### Use of Workspace Tags + +If you are using `coder_workspace_tags` ([docs](https://coder.com/docs/admin/templates/extending-templates/workspace-tags)) +in your template, chances are you do not have any provisioners running or they are under-resourced (see **Provisioner Latency**). + +Ensure your running provisioners are configured with your desired tags. + +### Reconciliation Loop Issue + +The prebuilds subsystem runs a _reconciliation loop_ which monitors the state of prebuilt workspaces to ensure the desired +number of instances are present at all times. Workspace Prebuilds is currently a BETA feature and so there could be a bug +in this _reconciliation loop_, which should be reported to Coder. + +Examine your coderd logs for any errors or warnings relating to prebuilds. \ No newline at end of file diff --git a/coder-observability/templates/configmap-prometheus-alerts.yaml b/coder-observability/templates/configmap-prometheus-alerts.yaml index eec7171..bf9bcc4 100644 --- a/coder-observability/templates/configmap-prometheus-alerts.yaml +++ b/coder-observability/templates/configmap-prometheus-alerts.yaml @@ -4,7 +4,7 @@ metadata: name: metrics-alerts namespace: {{ .Release.Namespace }} data: - {{- $service := dict "service" "coder" -}} + {{- $service := dict "service" "coderd" -}} {{- with .Values.global.coder.alerts.coderd }} {{/* start-section */}} coderd.yaml: |- @@ -104,6 +104,47 @@ data: {{- end }} {{- end }} + {{- with .groups.IneligiblePrebuilds }} + {{- $group := . 
}} + {{- if .enabled }} + - name: Coderd Ineligible Prebuilds + rules: + {{ $alert := "CoderdIneligiblePrebuilds" }} + {{- range $severity, $threshold := .thresholds }}{{/* one rule per configured severity; fires once the count reaches $threshold */}} + - alert: {{ $alert }} + expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) >= {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: > + {{ `{{ $value }}` }} prebuilt workspace(s) are currently ineligible for claiming for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. + This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .groups.UnprovisionedPrebuiltWorkspaces }} + {{- $group := . }} + {{- if .enabled }} + - name: Coderd Unprovisioned Prebuilt Workspaces + rules: + {{ $alert := "CoderdUnprovisionedPrebuiltWorkspaces" }} + {{- range $severity, $threshold := .thresholds }}{{/* one rule per configured severity; fires once the count reaches $threshold */}} + - alert: {{ $alert }} + expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) >= {{ $threshold }} + for: {{ $group.delay }} + annotations: + summary: > + {{ `{{ $value }}` }} prebuilt workspace(s) have not yet been provisioned for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. 
+ labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{/* end-section */}} diff --git a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl new file mode 100644 index 0000000..938b501 --- /dev/null +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -0,0 +1,1050 @@ +{{ define "prebuilds-dashboard.json" }} +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 10, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "orange", + "index": 2, + "text": "Not enabled" + }, + "1": { + "color": "green", + "index": 0, + "text": "Enabled" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "color": "orange", + "index": 1, + "text": "Not enabled" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "valueSize": 15 + }, + "textMode": "auto", + "wideLayout": 
true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Experiment enabled?", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 49, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max(coderd_prebuilt_workspaces_desired) by (template_name, preset_name)) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max(coderd_prebuilt_workspaces_running) by (template_name, preset_name)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": 
"sum(max(coderd_prebuilt_workspaces_eligible) by (template_name, preset_name)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: Global", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 48, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_created_total)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_failed_total)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max by (template_name, preset_name) 
(coderd_prebuilt_workspaces_claimed_total)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, + "refId": "A" + } + ], + "title": "All Time: Global", + "type": "stat" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 2, + "panels": [], + "repeat": "template", + "repeatDirection": "h", + "title": "$template", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 5 + }, + "id": 31, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: $preset", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "custom.fillOpacity", + "value": 85 + }, + { + "id": "custom.fillBelowTo", + "value": "Running" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": 
"fixed" + } + }, + { + "id": "custom.fillBelowTo", + "value": "Eligible" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 4, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": false, + "interval": "", + "legendFormat": "Desired", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Running", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Eligible", + "range": true, + "refId": "E" + } + ], + "title": "Pool Capacity: $preset", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 13, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Created" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Claimed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 12, + "y": 5 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": 
"bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Created", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Failed", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Claimed", + "range": true, + "refId": "F" + } + ], + "title": "Pool Operations: $preset", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 5 + }, + "id": 1, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + 
"reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, + "refId": "A" + } + ], + "title": "All Time: $preset", + "type": "stat" + } + ], + "refresh": "{{- include "dashboard-refresh" . 
-}}", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilt_workspaces_desired,template_name)", + "hide": 0, + "includeAll": false, + "label": "Template", + "multi": false, + "name": "template", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilt_workspaces_desired,template_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": "", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", + "hide": 0, + "includeAll": true, + "label": "Preset", + "multi": true, + "name": "preset", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-{{- include "dashboard-range" . -}}", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Prebuilds", + "uid": "cej6jysyme22oa", + "version": 13, + "weekStart": "" +} +{{ end }} \ No newline at end of file diff --git a/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml b/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml new file mode 100644 index 0000000..14d5908 --- /dev/null +++ b/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: dashboards-prebuilds + namespace: {{ .Release.Namespace }} +data: + prebuilds.json: |- {{- include "prebuilds-dashboard.json" . 
| trim | nindent 4 }} \ No newline at end of file diff --git a/coder-observability/templates/statefulset-postgres-exporter.yaml b/coder-observability/templates/statefulset-postgres-exporter.yaml index bcef353..229c650 100644 --- a/coder-observability/templates/statefulset-postgres-exporter.yaml +++ b/coder-observability/templates/statefulset-postgres-exporter.yaml @@ -20,7 +20,7 @@ spec: spec: containers: - name: postgres-exporter - image: quay.io/prometheuscommunity/postgres-exporter + image: {{ .Values.global.postgres.exporter.image }} args: - --collector.long_running_transactions ports: diff --git a/coder-observability/templates/statefulset-runbook-viewer.yaml b/coder-observability/templates/statefulset-runbook-viewer.yaml index 0ab2179..64f50e4 100644 --- a/coder-observability/templates/statefulset-runbook-viewer.yaml +++ b/coder-observability/templates/statefulset-runbook-viewer.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: madness - image: dannyben/madness + image: {{ .Values.runbookViewer.image }} ports: - containerPort: 3000 name: madness diff --git a/coder-observability/templates/statefulset-sql-exporter.yaml b/coder-observability/templates/statefulset-sql-exporter.yaml index 3ef64c2..628339e 100644 --- a/coder-observability/templates/statefulset-sql-exporter.yaml +++ b/coder-observability/templates/statefulset-sql-exporter.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: sql-exporter - image: burningalchemist/sql_exporter + image: {{ .Values.sqlExporter.image }} args: - -config.file=/cfg/config.yaml ports: diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index c66a6ac..f35e12b 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -76,6 +76,16 @@ global: notify: 2 warning: 5 critical: 10 + IneligiblePrebuilds: + enabled: true + delay: 10m + thresholds: + notify: 1 + UnprovisionedPrebuiltWorkspaces: + enabled: true + delay: 10m + thresholds: + warn: 1 provisionerd: groups: 
Replicas: @@ -115,6 +125,8 @@ global: sslmode: disable # ensure that your secret has a field named `PGPASSWORD` mountSecret: "secret-postgres" + exporter: + image: "quay.io/prometheuscommunity/postgres-exporter" # global.postgres.alerts -- alerts for postgres alerts: @@ -146,6 +158,12 @@ global: # global.dashboards.queryTimeout -- how long until a query in Grafana will timeout after queryTimeout: 900 +runbookViewer: + image: "dannyben/madness" + +sqlExporter: + image: "burningalchemist/sql_exporter" + grafana-agent: enabled: true fullnameOverride: grafana-agent @@ -406,6 +424,10 @@ grafana: mountPath: /var/lib/grafana/dashboards/coder/4 configMap: dashboards-workspace-detail readOnly: false + - name: dashboards-prebuilds + mountPath: /var/lib/grafana/dashboards/coder/5 + configMap: dashboards-prebuilds + readOnly: false prometheus: enabled: true @@ -444,7 +466,7 @@ prometheus: serverFiles: prometheus.yml: # disables scraping of metrics by the Prometheus helm chart since this is managed by the collector - scrape_configs: + scrape_configs: [] # use custom rule files to be able to render templates (can't do that in values.yaml, unless that value is evaluated by a tpl call) rule_files: - /etc/config/alerts/*.yaml diff --git a/compiled/resources.yaml b/compiled/resources.yaml index c2d7968..6f4518e 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -735,306 +735,7 @@ data: scrape_timeout: 10s rule_files: - /etc/config/alerts/*.yaml - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-apiservers - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: default;kubernetes;https - source_labels: - - __meta_kubernetes_namespace - - __meta_kubernetes_service_name - - __meta_kubernetes_endpoint_port_name - scheme: https - tls_config: - ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes-cadvisor - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - honor_labels: true - job_name: kubernetes-service-endpoints - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape - - action: drop - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_service_name - target_label: service - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-service-endpoints-slow - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_service_name - target_label: service - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - scrape_interval: 5m - scrape_timeout: 30s - - honor_labels: true - job_name: prometheus-pushgateway - kubernetes_sd_configs: - - role: service - relabel_configs: - - action: keep - regex: pushgateway - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_probe - - honor_labels: true - job_name: kubernetes-services - kubernetes_sd_configs: - - role: service - metrics_path: /probe - params: - module: - - http_2xx - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_probe - - source_labels: - - __address__ - target_label: __param_target - - replacement: blackbox - target_label: __address__ - - source_labels: - - __param_target - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - source_labels: - - __meta_kubernetes_service_name - target_label: service - - honor_labels: true - job_name: kubernetes-pods - kubernetes_sd_configs: - - role: pod - relabel_configs: - - action: 
keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: drop - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-pods-slow - kubernetes_sd_configs: - - role: pod - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - scrape_interval: 5m - scrape_timeout: 30s + scrape_configs: [] alerting: alertmanagers: - kubernetes_sd_configs: @@ -1076,8 +777,8 @@ metadata: name: metrics-alerts namespace: coder-observability data: - coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, 
namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 
5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures " - provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: 
critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas " + coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple 
times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate 
a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds\n- name: Coderd Unprovisioned Prebuilt Workspaces\n rules:\n \n - alert: CoderdUnprovisionedPrebuiltWorkspaces\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - 
coderd_prebuilt_workspaces_running) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) not yet been provisioned for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n labels:\n severity: warn\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdunprovisionedprebuiltworkspaces " + provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas " enterprise.yaml: "groups:\n - name: Licences\n rules:\n \n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=1'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: critical\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats\n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=0.9'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats " postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: 
Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" --- @@ -1088,9 +789,9 @@ metadata: name: runbooks namespace: coder-observability annotations: - checksum/config: 7ab4d06d5b454cc584880f9b58fb50fe85f2803bbd0021edb957c5a2d73b640e + checksum/config: b0c41033d0385ee3d46488f08e85bcef0d939614dcb99194e0c5913dbf0c2c33 data: - coderd.md: | + coderd.md: |- # Coderd Runbooks ## CoderdCPUUsage @@ -1169,6 +870,63 @@ data: Your Enterprise license is approaching or has exceeded the number of seats purchased. Please contact your Coder sales contact, or visit https://coder.com/contact/sales. 
+ + ## CoderdIneligiblePrebuilds + + Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup + scripts have completed. + + If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. + + ## CoderdUnprovisionedPrebuiltWorkspaces + + The number of running prebuilt workspaces is lower than the desired number of instances. This could be for several reasons, + ordered by likelihood: + + ### Experiment/License + + The prebuilds feature is currently gated behind an experiment *and* a premium license. + + Ensure that the prebuilds experiment is enabled with `CODER_EXPERIMENTS=workspace-prebuilds`, and that you have a premium + license added. + + ### Preset Validation Issue + + Templates which have prebuilds configured will require a configured preset defined, with ALL of the required parameters + set in the preset. If any of these are missing, or any of the parameters - as defined - fail validation, then the prebuilds + subsystem will refuse to attempt a workspace build. + + Consult the coderd logs for more information; look out for errors or warnings from the prebuilds subsystem. + + ### Template Misconfiguration or Error + + Prebuilt workspaces cannot be provisioned due to some issue at `terraform apply`-time. This could be due to misconfigured + cloud resources, improper authorization, or any number of other issues. + + Visit the Workspaces page, change the search term to `owner:prebuilds`, and review the previously failed builds. The + error will likely be quite obvious. + + ### Provisioner Latency + + If your provisioners are overloaded and cannot process provisioner jobs quickly enough, prebuilt workspaces may be affected. + There is no prioritization at present for prebuilt workspace jobs. + + Ensure your provisioners are appropriately resourced (i.e. you have enough instances) to handle the concurrent build demand. 
+ + ### Use of Workspace Tags + + If you are using `coder_workspace_tags` ([docs](https://coder.com/docs/admin/templates/extending-templates/workspace-tags)) + in your template, chances are you do not have any provisioners running or they are under-resourced (see **Provisioner Latency**). + + Ensure your running provisioners are configured with your desired tags. + + ### Reconciliation Loop Issue + + The prebuilds subsystem runs a _reconciliation loop_ which monitors the state of prebuilt workspaces to ensure the desired + number of instances are present at all times. Workspace Prebuilds is currently a BETA feature and so there could be a bug + in this _reconciliation loop_, which should be reported to Coder. + + Examine your coderd logs for any errors or warnings relating to prebuilds. postgres.md: | # Postgres Runbooks @@ -2504,33 +2262,1045 @@ data: } ] }, - "unit": "s" - }, - "overrides": [ + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Errors" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 15 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "quantile(0.9, coder_pubsub_send_latency_seconds)", + "instant": false, + "legendFormat": "Send", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "quantile(0.9, coder_pubsub_receive_latency_seconds)", + "hide": false, + "instant": false, + "legendFormat": "Receive", + "range": true, + "refId": "B" + } + ], + "title": "Pubsub Latency (P90)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 18 + }, + "id": 35, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(pod) (rate(coderd_api_requests_processed_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[$__rate_interval]))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "API Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"description": "", + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 18 + }, + "id": 36, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "This shows the number of requests per second each `coderd` replica is handling.\n\nHeavy skewing towards a single `coderd` replica indicates faulty loadbalancing.", + "mode": "markdown" + }, + "pluginVersion": "10.4.0", + "transparent": true, + "type": "text" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Control Plane", + "uid": "coderd", + "version": 6, + "weekStart": "" + } +--- +# Source: coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: dashboards-prebuilds + namespace: coder-observability +data: + prebuilds.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 10, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "orange", + "index": 2, + "text": "Not enabled" + }, + "1": { + "color": "green", + "index": 0, + "text": "Enabled" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "color": "orange", + "index": 1, + "text": "Not enabled" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + 
} + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "valueSize": 15 + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Experiment enabled?", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 49, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max(coderd_prebuilt_workspaces_desired) by (template_name, preset_name)) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, 
+ "editorMode": "code", + "exemplar": false, + "expr": "sum(max(coderd_prebuilt_workspaces_running) by (template_name, preset_name)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max(coderd_prebuilt_workspaces_eligible) by (template_name, preset_name)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: Global", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 48, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_created_total)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max by 
(template_name, preset_name) (coderd_prebuilt_workspaces_failed_total)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_claimed_total)) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, + "refId": "A" + } + ], + "title": "All Time: Global", + "type": "stat" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 2, + "panels": [], + "repeat": "template", + "repeatDirection": "h", + "title": "$template", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 5 + }, + "id": 31, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { 
+ "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: $preset", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + 
"id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "custom.fillOpacity", + "value": 85 + }, + { + "id": "custom.fillBelowTo", + "value": "Running" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.fillBelowTo", + "value": "Eligible" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 4, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": false, + "interval": "", + "legendFormat": "Desired", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Running", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Eligible", + "range": true, + "refId": "E" + } + ], + 
"title": "Pool Capacity: $preset", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 13, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Created" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } 
+ ] + }, { "matcher": { "id": "byName", - "options": "Errors" + "options": "Claimed" }, "properties": [ { - "id": "unit", - "value": "short" - }, - { - "id": "thresholds", + "id": "color", "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] + "fixedColor": "dark-green", + "mode": "fixed" } } ] @@ -2538,29 +3308,27 @@ data: ] }, "gridPos": { - "h": 3, - "w": 4, + "h": 7, + "w": 8, "x": 12, - "y": 15 + "y": 5 }, - "id": 21, + "id": 38, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "pluginVersion": "10.4.0", + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", "targets": [ { "datasource": { @@ -2568,11 +3336,13 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "quantile(0.9, coder_pubsub_send_latency_seconds)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, "instant": false, - "legendFormat": "Send", + "interval": "", + "legendFormat": "Created", "range": true, - "refId": "A" + "refId": "B" }, { "datasource": { @@ -2580,16 +3350,31 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "quantile(0.9, coder_pubsub_receive_latency_seconds)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, - "legendFormat": "Receive", + "interval": "", + "legendFormat": "Failed", "range": true, - "refId": "B" 
+ "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Claimed", + "range": true, + "refId": "F" } ], - "title": "Pubsub Latency (P90)", - "type": "stat" + "title": "Pool Operations: $preset", + "type": "timeseries" }, { "datasource": { @@ -2600,78 +3385,53 @@ data: "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 100, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 0, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "fixedColor": "text", + "mode": "fixed" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "red", + "value": 80 } ] - }, - "unit": "reqps" + } }, "overrides": [] }, "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 18 + "h": 7, + "w": 4, + "x": 20, + "y": 5 }, - "id": 35, + "id": 1, + "interval": "30s", "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "tooltip": { - "mode": 
"single", - "sort": "none" - } + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true }, - "pluginVersion": "10.4.0", + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", "targets": [ { "datasource": { @@ -2680,48 +3440,103 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum by(pod) (rate(coderd_api_requests_processed_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[$__rate_interval]))", - "instant": false, - "legendFormat": "__auto", - "range": true, + "expr": "max(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, "refId": "A" } ], - "title": "API Requests", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "description": "", - "gridPos": { - "h": 6, - "w": 6, - "x": 6, - "y": 18 - }, - "id": 36, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "This shows the number of requests per second each `coderd` replica is handling.\n\nHeavy skewing towards a single `coderd` replica indicates 
faulty loadbalancing.", - "mode": "markdown" - }, - "pluginVersion": "10.4.0", - "transparent": true, - "type": "text" + "title": "All Time: $preset", + "type": "stat" } ], "refresh": "30s", "schemaVersion": 39, "tags": [], "templating": { - "list": [] + "list": [ + { + "allValue": "", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilt_workspaces_desired,template_name)", + "hide": 0, + "includeAll": false, + "label": "Template", + "multi": false, + "name": "template", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilt_workspaces_desired,template_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": "", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", + "hide": 0, + "includeAll": true, + "label": "Preset", + "multi": true, + "name": "preset", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { "from": "now-12h", @@ -2729,9 +3544,9 @@ data: }, "timepicker": {}, "timezone": "browser", - "title": "Control Plane", - "uid": "coderd", - "version": 6, + "title": "Prebuilds", + "uid": "cej6jysyme22oa", + "version": 13, "weekStart": "" } --- @@ -10162,7 +10977,7 @@ spec: template: metadata: annotations: - checksum/config: 308677931777ae343a565387f5edecd66c53876ef7d120e9df3c6196c1884b30 + checksum/config: bc7add19cdc0df1566dec1bf8f9421082357d4393124d6ea2df28d7e5888cc8a labels: app.kubernetes.io/name: loki app.kubernetes.io/instance: coder-observability @@ -10246,7 
+11061,7 @@ spec: template: metadata: annotations: - checksum/config: 616b4c9b39f90d71edfd156f86c7f57751f662542a83f3e345c30496c3da7d27 + checksum/config: 4dbe50185304244ab527314b7723d048ea4544f97d0a4d8e0192863861811005 prometheus.io/scrape: "true" labels: app.kubernetes.io/part-of: memberlist @@ -10518,6 +11333,10 @@ spec: mountPath: /var/lib/grafana/dashboards/coder/4 subPath: readOnly: false + - name: dashboards-prebuilds + mountPath: /var/lib/grafana/dashboards/coder/5 + subPath: + readOnly: false - name: storage mountPath: "/var/lib/grafana" - name: config @@ -10587,6 +11406,9 @@ spec: - name: dashboards-workspace-detail configMap: name: dashboards-workspace-detail + - name: dashboards-prebuilds + configMap: + name: dashboards-prebuilds - name: dashboards-infra configMap: name: grafana-dashboards-infra @@ -10725,7 +11547,7 @@ spec: template: metadata: annotations: - checksum/config: 616b4c9b39f90d71edfd156f86c7f57751f662542a83f3e345c30496c3da7d27 + checksum/config: 4dbe50185304244ab527314b7723d048ea4544f97d0a4d8e0192863861811005 prometheus.io/scrape: "true" labels: app.kubernetes.io/name: loki @@ -11042,7 +11864,7 @@ spec: template: metadata: annotations: - checksum/config: 616b4c9b39f90d71edfd156f86c7f57751f662542a83f3e345c30496c3da7d27 + checksum/config: 4dbe50185304244ab527314b7723d048ea4544f97d0a4d8e0192863861811005 prometheus.io/scrape: "true" labels: app.kubernetes.io/name: loki @@ -11242,7 +12064,7 @@ spec: app.kubernetes.io/name: prometheus app.kubernetes.io/instance: coder-observability app.kubernetes.io/version: v2.53.1 - helm.sh/chart: prometheus-25.24.1 + helm.sh/chart: prometheus-25.24.2 app.kubernetes.io/managed-by: Helm app.kubernetes.io/part-of: prometheus spec: @@ -11413,7 +12235,7 @@ spec: template: metadata: annotations: - checksum/config: 7ab4d06d5b454cc584880f9b58fb50fe85f2803bbd0021edb957c5a2d73b640e + checksum/config: b0c41033d0385ee3d46488f08e85bcef0d939614dcb99194e0c5913dbf0c2c33 labels: app: runbook-viewer spec: diff --git 
a/scripts/compile.sh b/scripts/compile.sh index 13515de..a00ce4f 100755 --- a/scripts/compile.sh +++ b/scripts/compile.sh @@ -1,6 +1,12 @@ #!/usr/bin/env bash set -euo pipefail +# check versions +HELM_VERSION=3.17 +YQ_VERSION=4.42 +[[ "$(helm version)" == *v${HELM_VERSION}* ]] || { echo "Expected helm version v${HELM_VERSION} but got $(helm version)" >&2; exit 1; } +[[ "$(yq --version)" == *v${YQ_VERSION}* ]] || { echo "Expected yq version v${YQ_VERSION} but got $(yq --version)" >&2; exit 1; } + source "$(dirname "${BASH_SOURCE[0]}")/lib.sh" helm repo add prometheus-community https://prometheus-community.github.io/helm-charts @@ -13,7 +19,7 @@ helm dependency update coder-observability/ # We *expect* that the versions will change in the rendered template output, so we ignore those, but # if there are changes to the manifests themselves then we need to fail the build to force manual review. helm template --namespace coder-observability -f coder-observability/values.yaml coder-observability coder-observability/ | \ - yq 'del(.spec.template.spec.containers[].image, .metadata.labels."helm.sh/chart", .metadata.labels."app.kubernetes.io/version")' - \ + yq e 'del(.spec.template.spec.containers[].image, .metadata.labels."helm.sh/chart", .metadata.labels."app.kubernetes.io/version")' - \ > compiled/resources.yaml check_unstaged "compiled" \ No newline at end of file