From f66dfc753c28393f5afb7e8c62c9d4c6fde33412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 21:32:34 +0100 Subject: [PATCH] grafana: add NixOS operations dashboard Loki-based dashboard for tracking NixOS operations including: - Upgrade activity and success/failure stats - Build activity during upgrades - Bootstrap logs for new VM deployments - ACME certificate renewal activity Log panels use LogQL json parsing with | keep host to show clean messages with host labels. Co-Authored-By: Claude Opus 4.5 --- .../grafana/dashboards/nixos-operations.json | 296 ++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 services/grafana/dashboards/nixos-operations.json diff --git a/services/grafana/dashboards/nixos-operations.json b/services/grafana/dashboards/nixos-operations.json new file mode 100644 index 0000000..feb35a5 --- /dev/null +++ b/services/grafana/dashboards/nixos-operations.json @@ -0,0 +1,296 @@ +{ + "uid": "nixos-operations", + "title": "NixOS Operations", + "tags": ["loki", "nixos", "operations", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "1m", + "time": { + "from": "now-24h", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "host", + "type": "query", + "datasource": {"type": "loki", "uid": "loki"}, + "query": "label_values(host)", + "refresh": 2, + "includeAll": true, + "multi": true, + "current": {"text": "All", "value": "$__all"} + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Upgrade Log Volume", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} [$__range]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "blue", "value": null}] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Total log entries from nixos-upgrade.service in selected time range" + }, + { + "id": 2, + "title": "Successful Upgrades", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"Done. The new configuration is\" [$__range]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Upgrades that completed successfully" + }, + { + "id": 3, + "title": "Upgrade Errors", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |~ \"(?i)error|failed\" [$__range]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "red", "value": 1} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Upgrade log entries containing errors" + }, + { + "id": 4, + "title": "Bootstrap Events", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "sum(count_over_time({job=\"bootstrap\", host=~\"$host\"} [$__range]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "purple", "value": null}] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Bootstrap log entries from new VM deployments" + }, + { + "id": 5, + "title": "Upgrade Activity by Host", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "sum by (host) (count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} [5m]))", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 1, + "fillOpacity": 30, + "showPoints": "never", + "stacking": {"mode": "normal"} + } + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "description": "When upgrades ran on each host" + }, + { + "id": 6, + "title": "ACME Certificate Activity", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "sum by (host) (count_over_time({systemd_unit=~\"acme.*\", host=~\"$host\"} [5m]))", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 1, + "fillOpacity": 30, + "showPoints": "never", + "stacking": {"mode": "normal"} + } + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "description": "ACME certificate renewal activity" + }, + { + "id": 7, + "title": "Recent Upgrade Completions", + "type": "logs", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"Done. The new configuration is\" | json | line_format \"{{.MESSAGE}}\" | keep host", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "description": "Successful upgrade completion messages showing the new system path" + }, + { + "id": 8, + "title": "Build Activity", + "type": "logs", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"building\" | json | line_format \"{{.MESSAGE}}\" | keep host", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "description": "Derivations being built during upgrades" + }, + { + "id": 9, + "title": "Bootstrap Logs", + "type": "logs", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 20}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "{job=\"bootstrap\", host=~\"$host\"}", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "description": "Logs from VM bootstrap process (new deployments)" + }, + { + "id": 10, + "title": "Upgrade Errors & Failures", + "type": "logs", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 28}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |~ \"(?i)error|failed\" | json | line_format \"{{.MESSAGE}}\" | keep host", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "description": "Errors and failures during NixOS upgrades" + } + ] +}