From a5d5827dcc9931d20c39d9345bd167f7448bdfb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 20:50:08 +0100 Subject: [PATCH] grafana: add NixOS fleet dashboard Dashboard for monitoring NixOS deployments across the homelab: - Hosts behind remote / needing reboot stat panels - Fleet status table with revision, behind status, reboot needed, age - Generation age bar chart (shows stale configs) - Generations per host bar chart - Deployment activity time series (see when hosts were updated) - Flake input ages table - Pie charts for hosts by revision and tier - Tier filter variable Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/nixos-fleet.json | 558 +++++++++++++++++++ 1 file changed, 558 insertions(+) create mode 100644 services/grafana/dashboards/nixos-fleet.json diff --git a/services/grafana/dashboards/nixos-fleet.json b/services/grafana/dashboards/nixos-fleet.json new file mode 100644 index 0000000..d776831 --- /dev/null +++ b/services/grafana/dashboards/nixos-fleet.json @@ -0,0 +1,558 @@ +{ + "uid": "nixos-fleet-homelab", + "title": "NixOS Fleet - Homelab", + "tags": ["nixos", "fleet", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "1m", + "time": { + "from": "now-7d", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "tier", + "type": "query", + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "query": "label_values(nixos_flake_info, tier)", + "refresh": 2, + "includeAll": true, + "multi": false, + "current": {"text": "All", "value": "$__all"} + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Hosts Behind Remote", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 1)", + "legendFormat": "Behind", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": 
"absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none", + "textMode": "auto" + }, + "description": "Number of hosts where current revision differs from remote master" + }, + { + "id": 2, + "title": "Hosts Needing Reboot", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(nixos_config_mismatch{tier=~\"$tier\"} == 1)", + "legendFormat": "Need Reboot", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Hosts where booted generation differs from current (switched but not rebooted)" + }, + { + "id": 3, + "title": "Total Hosts", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(nixos_flake_info{tier=~\"$tier\"})", + "legendFormat": "Hosts", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "blue", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 4, + "title": "Nixpkgs Age", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "max(nixos_flake_input_age_seconds{input=\"nixpkgs\", tier=~\"$tier\"})", + 
"legendFormat": "Nixpkgs", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 604800}, + {"color": "orange", "value": 1209600}, + {"color": "red", "value": 2592000} + ] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Age of nixpkgs flake input (yellow >7d, orange >14d, red >30d)" + }, + { + "id": 5, + "title": "Hosts Up-to-date", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 0)", + "legendFormat": "Up-to-date", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 6, + "title": "Fleet Status", + "type": "table", + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 4}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "nixos_flake_info{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "info" + }, + { + "expr": "nixos_flake_revision_behind{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "behind" + }, + { + "expr": "nixos_config_mismatch{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "mismatch" + }, + { + "expr": "nixos_generation_age_seconds{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "age" + }, + { + "expr": "nixos_generation_count{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "count" + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": {"id": 
"byName", "options": "Hostname"}, + "properties": [{"id": "custom.width", "value": 120}] + }, + { + "matcher": {"id": "byName", "options": "Current Rev"}, + "properties": [{"id": "custom.width", "value": 90}] + }, + { + "matcher": {"id": "byName", "options": "Remote Rev"}, + "properties": [{"id": "custom.width", "value": 90}] + }, + { + "matcher": {"id": "byName", "options": "Behind"}, + "properties": [ + {"id": "custom.width", "value": 70}, + {"id": "mappings", "value": [ + {"type": "value", "options": {"0": {"text": "No", "color": "green"}}}, + {"type": "value", "options": {"1": {"text": "Yes", "color": "red"}}} + ]}, + {"id": "custom.cellOptions", "value": {"type": "color-text"}} + ] + }, + { + "matcher": {"id": "byName", "options": "Need Reboot"}, + "properties": [ + {"id": "custom.width", "value": 100}, + {"id": "mappings", "value": [ + {"type": "value", "options": {"0": {"text": "No", "color": "green"}}}, + {"type": "value", "options": {"1": {"text": "Yes", "color": "orange"}}} + ]}, + {"id": "custom.cellOptions", "value": {"type": "color-text"}} + ] + }, + { + "matcher": {"id": "byName", "options": "Config Age"}, + "properties": [ + {"id": "unit", "value": "s"}, + {"id": "custom.width", "value": 100} + ] + }, + { + "matcher": {"id": "byName", "options": "Generations"}, + "properties": [{"id": "custom.width", "value": 100}] + }, + { + "matcher": {"id": "byName", "options": "Tier"}, + "properties": [{"id": "custom.width", "value": 60}] + }, + { + "matcher": {"id": "byName", "options": "Role"}, + "properties": [{"id": "custom.width", "value": 80}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Hostname", "desc": false}] + }, + "transformations": [ + { + "id": "joinByField", + "options": {"byField": "hostname", "mode": "outer"} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Value #info": true, + 
"dns_role": true, + "dns_role 1": true, + "dns_role 2": true, + "dns_role 3": true, + "dns_role 4": true, + "instance": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "job": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "nixos_version": true, + "nixpkgs_rev": true, + "role 1": true, + "role 2": true, + "role 3": true, + "role 4": true, + "tier 1": true, + "tier 2": true, + "tier 3": true, + "tier 4": true + }, + "indexByName": { + "hostname": 0, + "tier": 1, + "role": 2, + "current_rev": 3, + "remote_rev": 4, + "Value #behind": 5, + "Value #mismatch": 6, + "Value #age": 7, + "Value #count": 8 + }, + "renameByName": { + "hostname": "Hostname", + "tier": "Tier", + "role": "Role", + "current_rev": "Current Rev", + "remote_rev": "Remote Rev", + "Value #behind": "Behind", + "Value #mismatch": "Need Reboot", + "Value #age": "Config Age", + "Value #count": "Generations" + } + } + } + ] + }, + { + "id": 7, + "title": "Generation Age by Host", + "type": "bargauge", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sort_desc(nixos_generation_age_seconds{tier=~\"$tier\"})", + "legendFormat": "{{hostname}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 86400}, + {"color": "orange", "value": 259200}, + {"color": "red", "value": 604800} + ] + }, + "min": 0 + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true + }, + "description": "How long ago each host's current config was deployed (yellow >1d, orange >3d, red >7d)" + }, + { + "id": 8, + "title": "Generations per Host", + "type": "bargauge", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14}, + 
"datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sort_desc(nixos_generation_count{tier=~\"$tier\"})", + "legendFormat": "{{hostname}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "blue", "value": null}, + {"color": "purple", "value": 50} + ] + }, + "min": 0 + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true + }, + "description": "Total number of NixOS generations on each host" + }, + { + "id": 9, + "title": "Deployment Activity (Generation Age Over Time)", + "type": "timeseries", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 22}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "nixos_generation_age_seconds{tier=~\"$tier\"}", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 1, + "fillOpacity": 0, + "showPoints": "never", + "stacking": {"mode": "none"} + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "description": "Generation age increases over time, drops to near-zero when deployed. Useful to see deployment patterns." 
+ }, + { + "id": 10, + "title": "Flake Input Ages", + "type": "table", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "max by (input) (nixos_flake_input_age_seconds)", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Flake Input"}, + "properties": [{"id": "custom.width", "value": 150}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Age", "desc": true}] + }, + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": {"Time": true}, + "renameByName": { + "input": "Flake Input", + "Value": "Age" + } + } + } + ], + "description": "Age of each flake input across the fleet" + }, + { + "id": 11, + "title": "Hosts by Revision", + "type": "piechart", + "gridPos": {"h": 6, "w": 6, "x": 12, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count by (current_rev) (nixos_flake_info{tier=~\"$tier\"})", + "legendFormat": "{{current_rev}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": {} + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "legend": {"displayMode": "table", "placement": "right", "values": ["value"]}, + "pieType": "pie" + }, + "description": "Distribution of hosts by their current flake revision" + }, + { + "id": 12, + "title": "Hosts by Tier", + "type": "piechart", + "gridPos": {"h": 6, "w": 6, "x": 18, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count by (tier) (nixos_flake_info)", + "legendFormat": "{{tier}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": {} + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "legend": {"displayMode": "table", "placement": "right", "values": ["value"]}, + 
"pieType": "pie" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "^$", + "renamePattern": "prod" + } + } + ], + "description": "Distribution of hosts by tier (test vs prod)" + } + ] +}