From 89d0a6f3580062c606e8becd73941b607ee06ab7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?=
Date: Sun, 8 Feb 2026 21:06:59 +0100
Subject: [PATCH] grafana: add systemd services dashboard

Dashboard for monitoring systemd across the fleet:
- Summary stats: failed/active/inactive units, restarts, timers
- Failed units table (shows any units in failed state)
- Service restarts table (top 15 services by restart count)
- Active units per host bar chart
- NixOS upgrade timer table with last trigger time
- Backup timers table (restic jobs)
- Service restarts over time chart
- Hostname filter to focus on specific hosts

Co-Authored-By: Claude Opus 4.5
---
 services/grafana/dashboards/systemd.json | 553 +++++++++++++++++++++++
 1 file changed, 553 insertions(+)
 create mode 100644 services/grafana/dashboards/systemd.json

diff --git a/services/grafana/dashboards/systemd.json b/services/grafana/dashboards/systemd.json
new file mode 100644
index 0000000..8162527
--- /dev/null
+++ b/services/grafana/dashboards/systemd.json
@@ -0,0 +1,553 @@
+{
+  "uid": "systemd-homelab",
+  "title": "Systemd Services - Homelab",
+  "tags": ["systemd", "services", "homelab"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "version": 1,
+  "refresh": "1m",
+  "time": {
+    "from": "now-24h",
+    "to": "now"
+  },
+  "templating": {
+    "list": [
+      {
+        "name": "hostname",
+        "type": "query",
+        "datasource": {"type": "prometheus", "uid": "prometheus"},
+        "query": "label_values(systemd_unit_state, hostname)",
+        "refresh": 2,
+        "includeAll": true,
+        "multi": true,
+        "current": {"text": "All", "value": "$__all"}
+      }
+    ]
+  },
+  "panels": [
+    {
+      "id": 1,
+      "title": "Failed Units",
+      "type": "stat",
+      "gridPos": {"h": 4, "w": 4, "x": 0, "y": 0},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "count(systemd_unit_state{state=\"failed\", hostname=~\"$hostname\"} == 1) or vector(0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {"color": "green", "value": null},
+              {"color": "red", "value": 1}
+            ]
+          },
+          "noValue": "0"
+        }
+      },
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "colorMode": "value",
+        "graphMode": "none"
+      }
+    },
+    {
+      "id": 2,
+      "title": "Active Units",
+      "type": "stat",
+      "gridPos": {"h": 4, "w": 4, "x": 4, "y": 0},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "count(systemd_unit_state{state=\"active\", hostname=~\"$hostname\"} == 1)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [{"color": "green", "value": null}]
+          }
+        }
+      },
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "colorMode": "value",
+        "graphMode": "none"
+      }
+    },
+    {
+      "id": 3,
+      "title": "Hosts Monitored",
+      "type": "stat",
+      "gridPos": {"h": 4, "w": 4, "x": 8, "y": 0},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "count(count by (hostname) (systemd_unit_state{hostname=~\"$hostname\"}))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [{"color": "blue", "value": null}]
+          }
+        }
+      },
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "colorMode": "value",
+        "graphMode": "none"
+      }
+    },
+    {
+      "id": 4,
+      "title": "Total Service Restarts",
+      "type": "stat",
+      "gridPos": {"h": 4, "w": 4, "x": 12, "y": 0},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum(systemd_service_restart_total{hostname=~\"$hostname\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {"color": "green", "value": null},
+              {"color": "yellow", "value": 10},
+              {"color": "orange", "value": 50}
+            ]
+          },
+          "noValue": "0"
+        }
+      },
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "colorMode": "value",
+        "graphMode": "none"
+      }
+    },
+    {
+      "id": 5,
+      "title": "Inactive Units",
+      "type": "stat",
+      "gridPos": {"h": 4, "w": 4, "x": 16, "y": 0},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "count(systemd_unit_state{state=\"inactive\", hostname=~\"$hostname\"} == 1)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [{"color": "purple", "value": null}]
+          }
+        }
+      },
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "colorMode": "value",
+        "graphMode": "none"
+      }
+    },
+    {
+      "id": 6,
+      "title": "Timers",
+      "type": "stat",
+      "gridPos": {"h": 4, "w": 4, "x": 20, "y": 0},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "count(systemd_timer_last_trigger_seconds{hostname=~\"$hostname\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [{"color": "blue", "value": null}]
+          }
+        }
+      },
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "colorMode": "value",
+        "graphMode": "none"
+      }
+    },
+    {
+      "id": 7,
+      "title": "Failed Units",
+      "type": "table",
+      "gridPos": {"h": 6, "w": 12, "x": 0, "y": 4},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "systemd_unit_state{state=\"failed\", hostname=~\"$hostname\"} == 1",
+          "format": "table",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {},
+        "overrides": [
+          {
+            "matcher": {"id": "byName", "options": "Host"},
+            "properties": [{"id": "custom.width", "value": 120}]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Unit"},
+            "properties": [{"id": "custom.width", "value": 300}]
+          }
+        ]
+      },
+      "options": {
+        "showHeader": true,
+        "sortBy": [{"displayName": "Host", "desc": false}]
+      },
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true,
+              "Value": true,
+              "__name__": true,
+              "dns_role": true,
+              "instance": true,
+              "job": true,
+              "role": true,
+              "state": true,
+              "tier": true,
+              "type": true
+            },
+            "renameByName": {
+              "hostname": "Host",
+              "name": "Unit"
+            }
+          }
+        }
+      ],
+      "description": "Units currently in failed state"
+    },
+    {
+      "id": 8,
+      "title": "Service Restarts (Top 15)",
+      "type": "table",
+      "gridPos": {"h": 6, "w": 12, "x": 12, "y": 4},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "topk(15, systemd_service_restart_total{hostname=~\"$hostname\"} > 0)",
+          "format": "table",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {},
+        "overrides": [
+          {
+            "matcher": {"id": "byName", "options": "Host"},
+            "properties": [{"id": "custom.width", "value": 120}]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Service"},
+            "properties": [{"id": "custom.width", "value": 280}]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Restarts"},
+            "properties": [{"id": "custom.width", "value": 80}]
+          }
+        ]
+      },
+      "options": {
+        "showHeader": true,
+        "sortBy": [{"displayName": "Restarts", "desc": true}]
+      },
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true,
+              "__name__": true,
+              "dns_role": true,
+              "instance": true,
+              "job": true,
+              "role": true,
+              "tier": true
+            },
+            "renameByName": {
+              "hostname": "Host",
+              "name": "Service",
+              "Value": "Restarts"
+            }
+          }
+        }
+      ],
+      "description": "Services that have been restarted (since host boot)"
+    },
+    {
+      "id": 9,
+      "title": "Active Units per Host",
+      "type": "bargauge",
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 10},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sort_desc(count by (hostname) (systemd_unit_state{state=\"active\", hostname=~\"$hostname\"} == 1))",
+          "legendFormat": "{{hostname}}",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [{"color": "green", "value": null}]
+          },
+          "min": 0
+        }
+      },
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "orientation": "horizontal",
+        "displayMode": "gradient",
+        "showUnfilled": true
+      }
+    },
+    {
+      "id": 10,
+      "title": "NixOS Upgrade Timers",
+      "type": "table",
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 10},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "systemd_timer_last_trigger_seconds{name=\"nixos-upgrade.timer\", hostname=~\"$hostname\"} * 1000",
+          "format": "table",
+          "instant": true,
+          "refId": "last"
+        },
+        {
+          "expr": "time() - systemd_timer_last_trigger_seconds{name=\"nixos-upgrade.timer\", hostname=~\"$hostname\"}",
+          "format": "table",
+          "instant": true,
+          "refId": "ago"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {},
+        "overrides": [
+          {
+            "matcher": {"id": "byName", "options": "Host"},
+            "properties": [{"id": "custom.width", "value": 130}]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Last Trigger"},
+            "properties": [
+              {"id": "unit", "value": "dateTimeAsLocalNoDateIfToday"},
+              {"id": "custom.width", "value": 180}
+            ]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Time Ago"},
+            "properties": [
+              {"id": "unit", "value": "s"},
+              {"id": "custom.width", "value": 120},
+              {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}},
+              {"id": "custom.cellOptions", "value": {"type": "color-text"}}
+            ]
+          }
+        ]
+      },
+      "options": {
+        "showHeader": true,
+        "sortBy": [{"displayName": "Time Ago", "desc": true}]
+      },
+      "transformations": [
+        {
+          "id": "joinByField",
+          "options": {"byField": "hostname", "mode": "outer"}
+        },
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true,
+              "Time 1": true,
+              "__name__": true,
+              "__name__ 1": true,
+              "dns_role": true,
+              "dns_role 1": true,
+              "instance": true,
+              "instance 1": true,
+              "job": true,
+              "job 1": true,
+              "name": true,
+              "name 1": true,
+              "role": true,
+              "role 1": true,
+              "tier": true,
+              "tier 1": true
+            },
+            "indexByName": {
+              "hostname": 0,
+              "Value #last": 1,
+              "Value #ago": 2
+            },
+            "renameByName": {
+              "hostname": "Host",
+              "Value #last": "Last Trigger",
+              "Value #ago": "Time Ago"
+            }
+          }
+        }
+      ],
+      "description": "When nixos-upgrade.timer last ran on each host. Yellow >24h, Red >48h."
+    },
+    {
+      "id": 11,
+      "title": "Backup Timers",
+      "type": "table",
+      "gridPos": {"h": 6, "w": 12, "x": 0, "y": 18},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "systemd_timer_last_trigger_seconds{name=~\"restic.*\", hostname=~\"$hostname\"} * 1000",
+          "format": "table",
+          "instant": true,
+          "refId": "last"
+        },
+        {
+          "expr": "time() - systemd_timer_last_trigger_seconds{name=~\"restic.*\", hostname=~\"$hostname\"}",
+          "format": "table",
+          "instant": true,
+          "refId": "ago"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {},
+        "overrides": [
+          {
+            "matcher": {"id": "byName", "options": "Host"},
+            "properties": [{"id": "custom.width", "value": 120}]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Timer"},
+            "properties": [{"id": "custom.width", "value": 220}]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Last Trigger"},
+            "properties": [
+              {"id": "unit", "value": "dateTimeAsLocalNoDateIfToday"},
+              {"id": "custom.width", "value": 180}
+            ]
+          },
+          {
+            "matcher": {"id": "byName", "options": "Time Ago"},
+            "properties": [
+              {"id": "unit", "value": "s"},
+              {"id": "custom.width", "value": 100},
+              {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}},
+              {"id": "custom.cellOptions", "value": {"type": "color-text"}}
+            ]
+          }
+        ]
+      },
+      "options": {
+        "showHeader": true,
+        "sortBy": [{"displayName": "Time Ago", "desc": true}]
+      },
+      "transformations": [
+        {
+          "id": "joinByField",
+          "options": {"byField": "name", "mode": "outer"}
+        },
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true,
+              "Time 1": true,
+              "__name__": true,
+              "__name__ 1": true,
+              "dns_role": true,
+              "dns_role 1": true,
+              "instance": true,
+              "instance 1": true,
+              "job": true,
+              "job 1": true,
+              "role": true,
+              "role 1": true,
+              "tier": true,
+              "tier 1": true,
+              "hostname 1": true
+            },
+            "indexByName": {
+              "hostname": 0,
+              "name": 1,
+              "Value #last": 2,
+              "Value #ago": 3
+            },
+            "renameByName": {
+              "hostname": "Host",
+              "name": "Timer",
+              "Value #last": "Last Trigger",
+              "Value #ago": "Time Ago"
+            }
+          }
+        }
+      ],
+      "description": "Restic backup timers"
+    },
+    {
+      "id": 12,
+      "title": "Service Restarts Over Time",
+      "type": "timeseries",
+      "gridPos": {"h": 6, "w": 12, "x": 12, "y": 18},
+      "datasource": {"type": "prometheus", "uid": "prometheus"},
+      "targets": [
+        {
+          "expr": "sum by (hostname) (increase(systemd_service_restart_total{hostname=~\"$hostname\"}[1h]))",
+          "legendFormat": "{{hostname}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short",
+          "custom": {
+            "lineWidth": 1,
+            "fillOpacity": 20,
+            "showPoints": "never",
+            "stacking": {"mode": "normal"}
+          }
+        }
+      },
+      "options": {
+        "legend": {"displayMode": "list", "placement": "bottom"},
+        "tooltip": {"mode": "multi", "sort": "desc"}
+      },
+      "description": "Service restart rate per hour"
+    }
+  ]
+}