From 4bf0eeeadb4ba3d945edff85bd611f6f17904b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 20:39:21 +0100 Subject: [PATCH 1/8] grafana: add dashboards and fix permissions - Change default OIDC role from Viewer to Editor for Explore access - Add declarative dashboard provisioning - Add node-exporter dashboard (CPU, memory, disk, load, network, I/O) - Add Loki logs dashboard with host/job filters Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/logs.json | 85 +++++++ .../grafana/dashboards/node-exporter.json | 208 ++++++++++++++++++ services/grafana/default.nix | 17 +- 3 files changed, 308 insertions(+), 2 deletions(-) create mode 100644 services/grafana/dashboards/logs.json create mode 100644 services/grafana/dashboards/node-exporter.json diff --git a/services/grafana/dashboards/logs.json b/services/grafana/dashboards/logs.json new file mode 100644 index 0000000..e8678eb --- /dev/null +++ b/services/grafana/dashboards/logs.json @@ -0,0 +1,85 @@ +{ + "uid": "logs-homelab", + "title": "Logs - Homelab", + "tags": ["loki", "logs", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "templating": { + "list": [ + { + "name": "host", + "type": "query", + "datasource": {"type": "loki", "uid": "loki"}, + "query": "label_values(host)", + "refresh": 2, + "includeAll": true, + "multi": false, + "current": {"text": "All", "value": "$__all"} + }, + { + "name": "job", + "type": "query", + "datasource": {"type": "loki", "uid": "loki"}, + "query": "label_values(job)", + "refresh": 2, + "includeAll": true, + "multi": false, + "current": {"text": "All", "value": "$__all"} + }, + { + "name": "search", + "type": "textbox", + "current": {"text": "", "value": ""}, + "label": "Search" + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Log Volume", + "type": "timeseries", + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 0}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { 
+ "expr": "sum by (host) (count_over_time({host=~\"$host\", job=~\"$job\"} |~ \"$search\" [1m]))", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"} + } + }, + { + "id": 2, + "title": "Logs", + "type": "logs", + "gridPos": {"h": 18, "w": 24, "x": 0, "y": 6}, + "datasource": {"type": "loki", "uid": "loki"}, + "targets": [ + { + "expr": "{host=~\"$host\", job=~\"$job\"} |~ \"$search\"", + "refId": "A" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "sortOrder": "Descending" + } + } + ] +} diff --git a/services/grafana/dashboards/node-exporter.json b/services/grafana/dashboards/node-exporter.json new file mode 100644 index 0000000..5da2746 --- /dev/null +++ b/services/grafana/dashboards/node-exporter.json @@ -0,0 +1,208 @@ +{ + "uid": "node-exporter-homelab", + "title": "Node Exporter - Homelab", + "tags": ["node-exporter", "prometheus", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "templating": { + "list": [ + { + "name": "instance", + "type": "query", + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "query": "label_values(node_uname_info, instance)", + "refresh": 2, + "includeAll": false, + "multi": false, + "current": {} + } + ] + }, + "panels": [ + { + "id": 1, + "title": "CPU Usage", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"$instance\"}[5m])) * 100)", + "legendFormat": "CPU %", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + 
"steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 90} + ] + } + } + } + }, + { + "id": 2, + "title": "Memory Usage", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$instance\"} / node_memory_MemTotal_bytes{instance=~\"$instance\"})) * 100", + "legendFormat": "Memory %", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 90} + ] + } + } + } + }, + { + "id": 3, + "title": "Disk Usage", + "type": "gauge", + "gridPos": {"h": 8, "w": 8, "x": 0, "y": 8}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=~\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{instance=~\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"}) * 100)", + "legendFormat": "Root /", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + } + } + } + }, + { + "id": 4, + "title": "System Load", + "type": "timeseries", + "gridPos": {"h": 8, "w": 8, "x": 8, "y": 8}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "node_load1{instance=~\"$instance\"}", + "legendFormat": "1m", + "refId": "A" + }, + { + "expr": "node_load5{instance=~\"$instance\"}", + "legendFormat": "5m", + "refId": "B" + }, + { + "expr": "node_load15{instance=~\"$instance\"}", + "legendFormat": "15m", + "refId": "C" + } + ], + "fieldConfig": { + 
"defaults": { + "unit": "short" + } + } + }, + { + "id": 5, + "title": "Uptime", + "type": "stat", + "gridPos": {"h": 8, "w": 8, "x": 16, "y": 8}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "time() - node_boot_time_seconds{instance=~\"$instance\"}", + "legendFormat": "Uptime", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + } + } + }, + { + "id": 6, + "title": "Network Traffic", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{instance=~\"$instance\",device!~\"lo|veth.*|br.*|docker.*\"}[5m])", + "legendFormat": "Receive {{device}}", + "refId": "A" + }, + { + "expr": "-rate(node_network_transmit_bytes_total{instance=~\"$instance\",device!~\"lo|veth.*|br.*|docker.*\"}[5m])", + "legendFormat": "Transmit {{device}}", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + } + } + }, + { + "id": 7, + "title": "Disk I/O", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{instance=~\"$instance\",device!~\"dm-.*\"}[5m])", + "legendFormat": "Read {{device}}", + "refId": "A" + }, + { + "expr": "-rate(node_disk_written_bytes_total{instance=~\"$instance\",device!~\"dm-.*\"}[5m])", + "legendFormat": "Write {{device}}", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + } + } + } + ] +} diff --git a/services/grafana/default.nix b/services/grafana/default.nix index ca22e8c..75413c1 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -28,8 +28,8 @@ email_attribute_path = "email"; login_attribute_path = "preferred_username"; name_attribute_path = "name"; - # Map admins group to Admin role, everyone else to Viewer - role_attribute_path = 
"contains(groups[*], 'admins') && 'Admin' || 'Viewer'"; + # Map admins group to Admin role, everyone else to Editor (for Explore access) + role_attribute_path = "contains(groups[*], 'admins') && 'Admin' || 'Editor'"; allow_sign_up = true; }; }; @@ -53,6 +53,19 @@ } ]; }; + + # Declarative dashboards + provision.dashboards.settings = { + apiVersion = 1; + providers = [ + { + name = "homelab"; + type = "file"; + options.path = ./dashboards; + disableDeletion = true; + } + ]; + }; }; # Vault secret for OAuth2 client secret -- 2.49.1 From 1c13ec12a4d259686b7421a546cfd0b185ca04c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 20:45:52 +0100 Subject: [PATCH 2/8] grafana: add temperature dashboard Dashboard includes: - Current temperatures per room (stat panel) - Average home temperature (gauge) - Current humidity (stat panel) - 30-day temperature history with mean/min/max in legend - Temperature trend (rate of change per hour) - 24h min/max/avg table per room - 30-day humidity history Filters out device_temperature (internal sensor) metrics. 
Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/temperature.json | 399 +++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 services/grafana/dashboards/temperature.json diff --git a/services/grafana/dashboards/temperature.json b/services/grafana/dashboards/temperature.json new file mode 100644 index 0000000..7c80e6a --- /dev/null +++ b/services/grafana/dashboards/temperature.json @@ -0,0 +1,399 @@ +{ + "uid": "temperature-homelab", + "title": "Temperature - Homelab", + "tags": ["home-assistant", "temperature", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "1m", + "time": { + "from": "now-30d", + "to": "now" + }, + "templating": { + "list": [] + }, + "panels": [ + { + "id": 1, + "title": "Current Temperatures", + "type": "stat", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}", + "legendFormat": "{{friendly_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "blue", "value": null}, + {"color": "green", "value": 18}, + {"color": "yellow", "value": 24}, + {"color": "orange", "value": 27}, + {"color": "red", "value": 30} + ] + }, + "mappings": [] + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Temp (.*) Temperature", + "renamePattern": "$1" + } + } + ] + }, + { + "id": 2, + "title": "Average Home Temperature", + "type": "gauge", + "gridPos": {"h": 6, "w": 6, "x": 12, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + 
"expr": "avg(hass_sensor_temperature_celsius{entity!~\".*device_temperature|.*server.*\"})", + "legendFormat": "Average", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "min": 15, + "max": 30, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "blue", "value": null}, + {"color": "green", "value": 18}, + {"color": "yellow", "value": 24}, + {"color": "red", "value": 28} + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + } + }, + { + "id": 3, + "title": "Current Humidity", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 18, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "hass_sensor_humidity_percent{entity!~\".*server.*\"}", + "legendFormat": "{{friendly_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 30}, + {"color": "green", "value": 40}, + {"color": "yellow", "value": 60}, + {"color": "red", "value": 70} + ] + } + } + }, + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"] + }, + "orientation": "horizontal", + "colorMode": "value", + "graphMode": "none" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Temp (.*) Humidity", + "renamePattern": "$1" + } + } + ] + }, + { + "id": 4, + "title": "Temperature History (30 Days)", + "type": "timeseries", + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 6}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}", + "legendFormat": "{{friendly_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "pointSize": 5, + "showPoints": "never", + 
"spanNulls": 3600000 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "min", "max"] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Temp (.*) Temperature", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "temp_server Temperature", + "renamePattern": "Server" + } + } + ] + }, + { + "id": 5, + "title": "Temperature Trend (1h rate of change)", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "deriv(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[1h]) * 3600", + "legendFormat": "{{friendly_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "custom": { + "lineWidth": 1, + "fillOpacity": 20, + "showPoints": "never", + "spanNulls": 3600000 + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "blue", "value": null}, + {"color": "green", "value": -0.5}, + {"color": "green", "value": 0.5}, + {"color": "red", "value": 1} + ] + }, + "displayName": "${__field.labels.friendly_name}" + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Temp (.*) Temperature", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "temp_server Temperature", + "renamePattern": "Server" + } + } + ], + "description": "Rate of temperature change per hour. Positive = warming, Negative = cooling." 
+ }, + { + "id": 6, + "title": "24h Min / Max / Avg", + "type": "table", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "min_over_time(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[24h])", + "legendFormat": "{{friendly_name}}", + "refId": "min", + "instant": true + }, + { + "expr": "max_over_time(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[24h])", + "legendFormat": "{{friendly_name}}", + "refId": "max", + "instant": true + }, + { + "expr": "avg_over_time(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[24h])", + "legendFormat": "{{friendly_name}}", + "refId": "avg", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "celsius", + "decimals": 1 + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Room"}, + "properties": [{"id": "custom.width", "value": 150}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Room", "desc": false}] + }, + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "friendly_name", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "domain": true, + "entity": true, + "hostname": true, + "instance": true, + "job": true + }, + "renameByName": { + "friendly_name": "Room", + "Value #min": "Min (24h)", + "Value #max": "Max (24h)", + "Value #avg": "Avg (24h)" + } + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "Temp (.*) Temperature", + "renamePattern": "$1" + } + } + ] + }, + { + "id": 7, + "title": "Humidity History (30 Days)", + "type": "timeseries", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "hass_sensor_humidity_percent", + "legendFormat": "{{friendly_name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + 
"min": 0, + "max": 100, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never", + "spanNulls": 3600000 + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": ["mean", "min", "max"] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Temp (.*) Humidity", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "temp_server Humidity", + "renamePattern": "Server" + } + } + ] + } + ] +} -- 2.49.1 From a5d5827dcc9931d20c39d9345bd167f7448bdfb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 20:50:08 +0100 Subject: [PATCH 3/8] grafana: add NixOS fleet dashboard Dashboard for monitoring NixOS deployments across the homelab: - Hosts behind remote / needing reboot stat panels - Fleet status table with revision, behind status, reboot needed, age - Generation age bar chart (shows stale configs) - Generations per host bar chart - Deployment activity time series (see when hosts were updated) - Flake input ages table - Pie charts for hosts by revision and tier - Tier filter variable Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/nixos-fleet.json | 558 +++++++++++++++++++ 1 file changed, 558 insertions(+) create mode 100644 services/grafana/dashboards/nixos-fleet.json diff --git a/services/grafana/dashboards/nixos-fleet.json b/services/grafana/dashboards/nixos-fleet.json new file mode 100644 index 0000000..d776831 --- /dev/null +++ b/services/grafana/dashboards/nixos-fleet.json @@ -0,0 +1,558 @@ +{ + "uid": "nixos-fleet-homelab", + "title": "NixOS Fleet - Homelab", + "tags": ["nixos", "fleet", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "1m", + "time": { + "from": "now-7d", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "tier", + "type": "query", + "datasource": {"type": 
"prometheus", "uid": "prometheus"}, + "query": "label_values(nixos_flake_info, tier)", + "refresh": 2, + "includeAll": true, + "multi": false, + "current": {"text": "All", "value": "$__all"} + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Hosts Behind Remote", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 1)", + "legendFormat": "Behind", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none", + "textMode": "auto" + }, + "description": "Number of hosts where current revision differs from remote master" + }, + { + "id": 2, + "title": "Hosts Needing Reboot", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(nixos_config_mismatch{tier=~\"$tier\"} == 1)", + "legendFormat": "Need Reboot", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Hosts where booted generation differs from current (switched but not rebooted)" + }, + { + "id": 3, + "title": "Total Hosts", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": 
"count(nixos_flake_info{tier=~\"$tier\"})", + "legendFormat": "Hosts", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "blue", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 4, + "title": "Nixpkgs Age", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "max(nixos_flake_input_age_seconds{input=\"nixpkgs\", tier=~\"$tier\"})", + "legendFormat": "Nixpkgs", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 604800}, + {"color": "orange", "value": 1209600}, + {"color": "red", "value": 2592000} + ] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + }, + "description": "Age of nixpkgs flake input (yellow >7d, orange >14d, red >30d)" + }, + { + "id": 5, + "title": "Hosts Up-to-date", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 0)", + "legendFormat": "Up-to-date", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 6, + "title": "Fleet Status", + "type": "table", + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 4}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "nixos_flake_info{tier=~\"$tier\"}", + "format": "table", + 
"instant": true, + "refId": "info" + }, + { + "expr": "nixos_flake_revision_behind{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "behind" + }, + { + "expr": "nixos_config_mismatch{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "mismatch" + }, + { + "expr": "nixos_generation_age_seconds{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "age" + }, + { + "expr": "nixos_generation_count{tier=~\"$tier\"}", + "format": "table", + "instant": true, + "refId": "count" + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Hostname"}, + "properties": [{"id": "custom.width", "value": 120}] + }, + { + "matcher": {"id": "byName", "options": "Current Rev"}, + "properties": [{"id": "custom.width", "value": 90}] + }, + { + "matcher": {"id": "byName", "options": "Remote Rev"}, + "properties": [{"id": "custom.width", "value": 90}] + }, + { + "matcher": {"id": "byName", "options": "Behind"}, + "properties": [ + {"id": "custom.width", "value": 70}, + {"id": "mappings", "value": [ + {"type": "value", "options": {"0": {"text": "No", "color": "green"}}}, + {"type": "value", "options": {"1": {"text": "Yes", "color": "red"}}} + ]}, + {"id": "custom.cellOptions", "value": {"type": "color-text"}} + ] + }, + { + "matcher": {"id": "byName", "options": "Need Reboot"}, + "properties": [ + {"id": "custom.width", "value": 100}, + {"id": "mappings", "value": [ + {"type": "value", "options": {"0": {"text": "No", "color": "green"}}}, + {"type": "value", "options": {"1": {"text": "Yes", "color": "orange"}}} + ]}, + {"id": "custom.cellOptions", "value": {"type": "color-text"}} + ] + }, + { + "matcher": {"id": "byName", "options": "Config Age"}, + "properties": [ + {"id": "unit", "value": "s"}, + {"id": "custom.width", "value": 100} + ] + }, + { + "matcher": {"id": "byName", "options": "Generations"}, + "properties": [{"id": "custom.width", "value": 100}] + }, + { + "matcher": {"id": 
"byName", "options": "Tier"}, + "properties": [{"id": "custom.width", "value": 60}] + }, + { + "matcher": {"id": "byName", "options": "Role"}, + "properties": [{"id": "custom.width", "value": 80}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Hostname", "desc": false}] + }, + "transformations": [ + { + "id": "joinByField", + "options": {"byField": "hostname", "mode": "outer"} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Value #info": true, + "dns_role": true, + "dns_role 1": true, + "dns_role 2": true, + "dns_role 3": true, + "dns_role 4": true, + "instance": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "job": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "nixos_version": true, + "nixpkgs_rev": true, + "role 1": true, + "role 2": true, + "role 3": true, + "role 4": true, + "tier 1": true, + "tier 2": true, + "tier 3": true, + "tier 4": true + }, + "indexByName": { + "hostname": 0, + "tier": 1, + "role": 2, + "current_rev": 3, + "remote_rev": 4, + "Value #behind": 5, + "Value #mismatch": 6, + "Value #age": 7, + "Value #count": 8 + }, + "renameByName": { + "hostname": "Hostname", + "tier": "Tier", + "role": "Role", + "current_rev": "Current Rev", + "remote_rev": "Remote Rev", + "Value #behind": "Behind", + "Value #mismatch": "Need Reboot", + "Value #age": "Config Age", + "Value #count": "Generations" + } + } + } + ] + }, + { + "id": 7, + "title": "Generation Age by Host", + "type": "bargauge", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sort_desc(nixos_generation_age_seconds{tier=~\"$tier\"})", + "legendFormat": "{{hostname}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + 
"thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 86400}, + {"color": "orange", "value": 259200}, + {"color": "red", "value": 604800} + ] + }, + "min": 0 + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true + }, + "description": "How long ago each host's current config was deployed (yellow >1d, orange >3d, red >7d)" + }, + { + "id": 8, + "title": "Generations per Host", + "type": "bargauge", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sort_desc(nixos_generation_count{tier=~\"$tier\"})", + "legendFormat": "{{hostname}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "blue", "value": null}, + {"color": "purple", "value": 50} + ] + }, + "min": 0 + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true + }, + "description": "Total number of NixOS generations on each host" + }, + { + "id": 9, + "title": "Deployment Activity (Generation Age Over Time)", + "type": "timeseries", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 22}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "nixos_generation_age_seconds{tier=~\"$tier\"}", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 1, + "fillOpacity": 0, + "showPoints": "never", + "stacking": {"mode": "none"} + } + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "description": "Generation age increases over time, drops to near-zero when deployed. 
Useful to see deployment patterns." + }, + { + "id": 10, + "title": "Flake Input Ages", + "type": "table", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "max by (input) (nixos_flake_input_age_seconds)", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Flake Input"}, + "properties": [{"id": "custom.width", "value": 150}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Age", "desc": true}] + }, + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": {"Time": true}, + "renameByName": { + "input": "Flake Input", + "Value": "Age" + } + } + } + ], + "description": "Age of each flake input across the fleet" + }, + { + "id": 11, + "title": "Hosts by Revision", + "type": "piechart", + "gridPos": {"h": 6, "w": 6, "x": 12, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count by (current_rev) (nixos_flake_info{tier=~\"$tier\"})", + "legendFormat": "{{current_rev}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": {} + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "legend": {"displayMode": "table", "placement": "right", "values": ["value"]}, + "pieType": "pie" + }, + "description": "Distribution of hosts by their current flake revision" + }, + { + "id": 12, + "title": "Hosts by Tier", + "type": "piechart", + "gridPos": {"h": 6, "w": 6, "x": 18, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count by (tier) (nixos_flake_info)", + "legendFormat": "{{tier}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": {} + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "legend": {"displayMode": "table", "placement":
"right", "values": ["value"]}, + "pieType": "pie" + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "^$", + "renamePattern": "prod" + } + } + ], + "description": "Distribution of hosts by tier (test vs prod)" + } + ] +} -- 2.49.1 From d333aa0164f58c7e72c28a2c33a6db2fb0aa747a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 20:52:39 +0100 Subject: [PATCH 4/8] grafana: fix fleet table __name__ columns Exclude the __name__ columns that were leaking through the table transformations. Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/nixos-fleet.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/services/grafana/dashboards/nixos-fleet.json b/services/grafana/dashboards/nixos-fleet.json index d776831..5a7b61c 100644 --- a/services/grafana/dashboards/nixos-fleet.json +++ b/services/grafana/dashboards/nixos-fleet.json @@ -298,6 +298,12 @@ "Time 4": true, "Time 5": true, "Value #info": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, "dns_role": true, "dns_role 1": true, "dns_role 2": true, -- 2.49.1 From 1e52eec02a7d9e88b13e20273f9e713d947c3c02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 20:58:52 +0100 Subject: [PATCH 5/8] monitoring: always include tier label in scrape configs Previously tier was only included if non-default (not "prod"), which meant prod hosts had no tier label. This made the Grafana tier filter only show "test" since "prod" never appeared in label_values(). Now tier is always included, so both "prod" and "test" appear in the fleet dashboard tier selector. 
Co-Authored-By: Claude Opus 4.5 --- lib/monitoring.nix | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/monitoring.nix b/lib/monitoring.nix index 57bffb4..b10703b 100644 --- a/lib/monitoring.nix +++ b/lib/monitoring.nix @@ -58,10 +58,9 @@ let }; # Build effective labels for a host - # Always includes hostname; only includes tier/priority/role if non-default + # Always includes hostname and tier; only includes priority/role if non-default buildEffectiveLabels = host: - { hostname = host.hostname; } - // (lib.optionalAttrs (host.tier != "prod") { tier = host.tier; }) + { hostname = host.hostname; tier = host.tier; } // (lib.optionalAttrs (host.priority != "high") { priority = host.priority; }) // (lib.optionalAttrs (host.role != null) { role = host.role; }) // host.labels; -- 2.49.1 From 05630eb4d46491e8bb2b327bedcf294e5c0861b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 21:02:28 +0100 Subject: [PATCH 6/8] grafana: add Proxmox dashboard Dashboard for monitoring Proxmox VMs: - Summary stats: VMs running/stopped, node CPU/memory, uptime - VM status table with name, status, CPU%, memory%, uptime - VM CPU usage over time - VM memory usage over time - Network traffic (RX/TX) per VM - Disk I/O (read/write) per VM - Storage usage gauges and capacity table - VM filter to focus on specific VMs Filters out template VMs, shows only actual guests. 
Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/proxmox.json | 605 +++++++++++++++++++++++ 1 file changed, 605 insertions(+) create mode 100644 services/grafana/dashboards/proxmox.json diff --git a/services/grafana/dashboards/proxmox.json b/services/grafana/dashboards/proxmox.json new file mode 100644 index 0000000..ffba8a0 --- /dev/null +++ b/services/grafana/dashboards/proxmox.json @@ -0,0 +1,605 @@ +{ + "uid": "proxmox-homelab", + "title": "Proxmox - Homelab", + "tags": ["proxmox", "virtualization", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-6h", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "vm", + "type": "query", + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "query": "label_values(pve_guest_info{template=\"0\"}, name)", + "refresh": 2, + "includeAll": true, + "multi": true, + "current": {"text": "All", "value": "$__all"} + } + ] + }, + "panels": [ + { + "id": 1, + "title": "VMs Running", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(pve_up{id=~\"qemu/.*\"} * on(id) pve_guest_info{template=\"0\"} == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 2, + "title": "VMs Stopped", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(pve_up{id=~\"qemu/.*\"} * on(id) pve_guest_info{template=\"0\"} == 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", 
"value": 1}, + {"color": "red", "value": 3} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 3, + "title": "Node CPU", + "type": "gauge", + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_cpu_usage_ratio{id=~\"node/.*\"} * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 90} + ] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "showThresholdLabels": false, + "showThresholdMarkers": true + } + }, + { + "id": 4, + "title": "Node Memory", + "type": "gauge", + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_memory_usage_bytes{id=~\"node/.*\"} / pve_memory_size_bytes{id=~\"node/.*\"} * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 90} + ] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "showThresholdLabels": false, + "showThresholdMarkers": true + } + }, + { + "id": 5, + "title": "Node Uptime", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_uptime_seconds{id=~\"node/.*\"}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [{"color": "blue", "value": null}] + } + } + }, + "options": { + "reduceOptions": 
{"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 6, + "title": "Templates", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(pve_guest_info{template=\"1\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "purple", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 7, + "title": "VM Status", + "type": "table", + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 4}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_guest_info{template=\"0\", name=~\"$vm\"}", + "format": "table", + "instant": true, + "refId": "info" + }, + { + "expr": "pve_up{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}", + "format": "table", + "instant": true, + "refId": "status" + }, + { + "expr": "pve_cpu_usage_ratio{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"} * 100", + "format": "table", + "instant": true, + "refId": "cpu" + }, + { + "expr": "pve_memory_usage_bytes{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"} / on(id) pve_memory_size_bytes * 100", + "format": "table", + "instant": true, + "refId": "mem" + }, + { + "expr": "pve_uptime_seconds{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}", + "format": "table", + "instant": true, + "refId": "uptime" + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Name"}, + "properties": [{"id": "custom.width", "value": 150}] + }, + { + "matcher": {"id": "byName", "options": "Status"}, + "properties": [ + {"id": "custom.width", "value": 80}, + {"id": 
"mappings", "value": [ + {"type": "value", "options": {"0": {"text": "Stopped", "color": "red"}}}, + {"type": "value", "options": {"1": {"text": "Running", "color": "green"}}} + ]}, + {"id": "custom.cellOptions", "value": {"type": "color-text"}} + ] + }, + { + "matcher": {"id": "byName", "options": "CPU %"}, + "properties": [ + {"id": "unit", "value": "percent"}, + {"id": "decimals", "value": 1}, + {"id": "custom.width", "value": 80}, + {"id": "custom.cellOptions", "value": {"type": "gauge", "mode": "basic"}}, + {"id": "min", "value": 0}, + {"id": "max", "value": 100}, + {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 50}, {"color": "red", "value": 80}]}} + ] + }, + { + "matcher": {"id": "byName", "options": "Memory %"}, + "properties": [ + {"id": "unit", "value": "percent"}, + {"id": "decimals", "value": 1}, + {"id": "custom.width", "value": 100}, + {"id": "custom.cellOptions", "value": {"type": "gauge", "mode": "basic"}}, + {"id": "min", "value": 0}, + {"id": "max", "value": 100}, + {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 90}]}} + ] + }, + { + "matcher": {"id": "byName", "options": "Uptime"}, + "properties": [ + {"id": "unit", "value": "s"}, + {"id": "custom.width", "value": 100} + ] + }, + { + "matcher": {"id": "byName", "options": "ID"}, + "properties": [{"id": "custom.width", "value": 90}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Name", "desc": false}] + }, + "transformations": [ + { + "id": "joinByField", + "options": {"byField": "name", "mode": "outer"} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Value #info": true, + "id 1": true, + "id 2": true, + "id 3": true, + "id 4": true, + "instance": true, + "instance 1": true, 
+ "instance 2": true, + "instance 3": true, + "instance 4": true, + "job": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "name 1": true, + "name 2": true, + "name 3": true, + "name 4": true, + "node": true, + "tags": true, + "template": true, + "type": true + }, + "indexByName": { + "name": 0, + "id": 1, + "Value #status": 2, + "Value #cpu": 3, + "Value #mem": 4, + "Value #uptime": 5 + }, + "renameByName": { + "name": "Name", + "id": "ID", + "Value #status": "Status", + "Value #cpu": "CPU %", + "Value #mem": "Memory %", + "Value #uptime": "Uptime" + } + } + } + ] + }, + { + "id": 8, + "title": "VM CPU Usage", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_cpu_usage_ratio{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"} * 100", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + } + }, + { + "id": 9, + "title": "VM Memory Usage", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_memory_usage_bytes{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes", + "min": 0, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + } + }, + { + "id": 10, + "title": "VM Network Traffic", + "type": 
"timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 22}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "rate(pve_network_receive_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}", + "legendFormat": "{{name}} RX", + "refId": "A" + }, + { + "expr": "-rate(pve_network_transmit_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}", + "legendFormat": "{{name}} TX", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + } + }, + { + "id": 11, + "title": "VM Disk I/O", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 22}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "rate(pve_disk_read_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}", + "legendFormat": "{{name}} Read", + "refId": "A" + }, + { + "expr": "-rate(pve_disk_write_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}", + "legendFormat": "{{name}} Write", + "refId": "B" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps", + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + } + }, + { + "id": 12, + "title": "Storage Usage", + "type": "bargauge", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_disk_usage_bytes{id=~\"storage/.*\"} / pve_disk_size_bytes{id=~\"storage/.*\"} * 100", + "legendFormat": "{{id}}", + "refId": 
"A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true + }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "storage/pve1/(.*)", + "renamePattern": "$1" + } + } + ] + }, + { + "id": 13, + "title": "Storage Capacity", + "type": "table", + "gridPos": {"h": 6, "w": 12, "x": 12, "y": 30}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "pve_disk_size_bytes{id=~\"storage/.*\"}", + "format": "table", + "instant": true, + "refId": "size" + }, + { + "expr": "pve_disk_usage_bytes{id=~\"storage/.*\"}", + "format": "table", + "instant": true, + "refId": "used" + }, + { + "expr": "pve_disk_size_bytes{id=~\"storage/.*\"} - pve_disk_usage_bytes{id=~\"storage/.*\"}", + "format": "table", + "instant": true, + "refId": "free" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Storage"}, + "properties": [{"id": "unit", "value": "none"}] + } + ] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "joinByField", + "options": {"byField": "id", "mode": "outer"} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "instance": true, + "instance 1": true, + "instance 2": true, + "job": true, + "job 1": true, + "job 2": true + }, + "renameByName": { + "id": "Storage", + "Value #size": "Total", + "Value #used": "Used", + "Value #free": "Free" + } + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "storage/pve1/(.*)", + "renamePattern": "$1" + } + } + ] + } + ] +} -- 
2.49.1 From 03ebee4d822dd302bbcba9cd466823d999214f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 21:04:41 +0100 Subject: [PATCH 7/8] grafana: fix proxmox table __name__ column Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/proxmox.json | 1 + 1 file changed, 1 insertion(+) diff --git a/services/grafana/dashboards/proxmox.json b/services/grafana/dashboards/proxmox.json index ffba8a0..0f61f2b 100644 --- a/services/grafana/dashboards/proxmox.json +++ b/services/grafana/dashboards/proxmox.json @@ -313,6 +313,7 @@ "Time 3": true, "Time 4": true, "Value #info": true, + "__name__": true, "id 1": true, "id 2": true, "id 3": true, -- 2.49.1 From 89d0a6f3580062c606e8becd73941b607ee06ab7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 21:06:59 +0100 Subject: [PATCH 8/8] grafana: add systemd services dashboard Dashboard for monitoring systemd across the fleet: - Summary stats: failed/active/inactive units, restarts, timers - Failed units table (shows any units in failed state) - Service restarts table (top 15 services by restart count) - Active units per host bar chart - NixOS upgrade timer table with last trigger time - Backup timers table (restic jobs) - Service restarts over time chart - Hostname filter to focus on specific hosts Co-Authored-By: Claude Opus 4.5 --- services/grafana/dashboards/systemd.json | 553 +++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 services/grafana/dashboards/systemd.json diff --git a/services/grafana/dashboards/systemd.json b/services/grafana/dashboards/systemd.json new file mode 100644 index 0000000..8162527 --- /dev/null +++ b/services/grafana/dashboards/systemd.json @@ -0,0 +1,553 @@ +{ + "uid": "systemd-homelab", + "title": "Systemd Services - Homelab", + "tags": ["systemd", "services", "homelab"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "1m", + "time": { + "from": "now-24h", + 
"to": "now" + }, + "templating": { + "list": [ + { + "name": "hostname", + "type": "query", + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "query": "label_values(systemd_unit_state, hostname)", + "refresh": 2, + "includeAll": true, + "multi": true, + "current": {"text": "All", "value": "$__all"} + } + ] + }, + "panels": [ + { + "id": 1, + "title": "Failed Units", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(systemd_unit_state{state=\"failed\", hostname=~\"$hostname\"} == 1) or vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "red", "value": 1} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 2, + "title": "Active Units", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(systemd_unit_state{state=\"active\", hostname=~\"$hostname\"} == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 3, + "title": "Hosts Monitored", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(count by (hostname) (systemd_unit_state{hostname=~\"$hostname\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "blue", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": 
["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 4, + "title": "Total Service Restarts", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum(systemd_service_restart_total{hostname=~\"$hostname\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 10}, + {"color": "orange", "value": 50} + ] + }, + "noValue": "0" + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 5, + "title": "Inactive Units", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(systemd_unit_state{state=\"inactive\", hostname=~\"$hostname\"} == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "purple", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 6, + "title": "Timers", + "type": "stat", + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 0}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "count(systemd_timer_last_trigger_seconds{hostname=~\"$hostname\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "blue", "value": null}] + } + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "graphMode": "none" + } + }, + { + "id": 7, + "title": "Failed Units", + "type": "table", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 4}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + 
"expr": "systemd_unit_state{state=\"failed\", hostname=~\"$hostname\"} == 1", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Host"}, + "properties": [{"id": "custom.width", "value": 120}] + }, + { + "matcher": {"id": "byName", "options": "Unit"}, + "properties": [{"id": "custom.width", "value": 300}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Host", "desc": false}] + }, + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "dns_role": true, + "instance": true, + "job": true, + "role": true, + "state": true, + "tier": true, + "type": true + }, + "renameByName": { + "hostname": "Host", + "name": "Unit" + } + } + } + ], + "description": "Units currently in failed state" + }, + { + "id": 8, + "title": "Service Restarts (Top 15)", + "type": "table", + "gridPos": {"h": 6, "w": 12, "x": 12, "y": 4}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "topk(15, systemd_service_restart_total{hostname=~\"$hostname\"} > 0)", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Host"}, + "properties": [{"id": "custom.width", "value": 120}] + }, + { + "matcher": {"id": "byName", "options": "Service"}, + "properties": [{"id": "custom.width", "value": 280}] + }, + { + "matcher": {"id": "byName", "options": "Restarts"}, + "properties": [{"id": "custom.width", "value": 80}] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Restarts", "desc": true}] + }, + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "dns_role": true, + "instance": true, + "job": true, + "role": true, + "tier": true + }, + 
"renameByName": { + "hostname": "Host", + "name": "Service", + "Value": "Restarts" + } + } + } + ], + "description": "Services that have been restarted (since host boot)" + }, + { + "id": 9, + "title": "Active Units per Host", + "type": "bargauge", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 10}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sort_desc(count by (hostname) (systemd_unit_state{state=\"active\", hostname=~\"$hostname\"} == 1))", + "legendFormat": "{{hostname}}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + }, + "min": 0 + } + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true + } + }, + { + "id": 10, + "title": "NixOS Upgrade Timers", + "type": "table", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 10}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "systemd_timer_last_trigger_seconds{name=\"nixos-upgrade.timer\", hostname=~\"$hostname\"}", + "format": "table", + "instant": true, + "refId": "last" + }, + { + "expr": "time() - systemd_timer_last_trigger_seconds{name=\"nixos-upgrade.timer\", hostname=~\"$hostname\"}", + "format": "table", + "instant": true, + "refId": "ago" + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Host"}, + "properties": [{"id": "custom.width", "value": 130}] + }, + { + "matcher": {"id": "byName", "options": "Last Trigger"}, + "properties": [ + {"id": "unit", "value": "dateTimeAsLocalNoDateIfToday"}, + {"id": "custom.width", "value": 180} + ] + }, + { + "matcher": {"id": "byName", "options": "Time Ago"}, + "properties": [ + {"id": "unit", "value": "s"}, + {"id": "custom.width", "value": 120}, + {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": 
"green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}, + {"id": "custom.cellOptions", "value": {"type": "color-text"}} + ] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Time Ago", "desc": true}] + }, + "transformations": [ + { + "id": "joinByField", + "options": {"byField": "hostname", "mode": "outer"} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "__name__": true, + "__name__ 1": true, + "dns_role": true, + "dns_role 1": true, + "instance": true, + "instance 1": true, + "job": true, + "job 1": true, + "name": true, + "name 1": true, + "role": true, + "role 1": true, + "tier": true, + "tier 1": true + }, + "indexByName": { + "hostname": 0, + "Value #last": 1, + "Value #ago": 2 + }, + "renameByName": { + "hostname": "Host", + "Value #last": "Last Trigger", + "Value #ago": "Time Ago" + } + } + } + ], + "description": "When nixos-upgrade.timer last ran on each host. Yellow >24h, Red >48h." 
+ }, + { + "id": 11, + "title": "Backup Timers", + "type": "table", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 18}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "systemd_timer_last_trigger_seconds{name=~\"restic.*\", hostname=~\"$hostname\"}", + "format": "table", + "instant": true, + "refId": "last" + }, + { + "expr": "time() - systemd_timer_last_trigger_seconds{name=~\"restic.*\", hostname=~\"$hostname\"}", + "format": "table", + "instant": true, + "refId": "ago" + } + ], + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Host"}, + "properties": [{"id": "custom.width", "value": 120}] + }, + { + "matcher": {"id": "byName", "options": "Timer"}, + "properties": [{"id": "custom.width", "value": 220}] + }, + { + "matcher": {"id": "byName", "options": "Last Trigger"}, + "properties": [ + {"id": "unit", "value": "dateTimeAsLocalNoDateIfToday"}, + {"id": "custom.width", "value": 180} + ] + }, + { + "matcher": {"id": "byName", "options": "Time Ago"}, + "properties": [ + {"id": "unit", "value": "s"}, + {"id": "custom.width", "value": 100}, + {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}}, + {"id": "custom.cellOptions", "value": {"type": "color-text"}} + ] + } + ] + }, + "options": { + "showHeader": true, + "sortBy": [{"displayName": "Time Ago", "desc": true}] + }, + "transformations": [ + { + "id": "joinByField", + "options": {"byField": "name", "mode": "outer"} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "__name__": true, + "__name__ 1": true, + "dns_role": true, + "dns_role 1": true, + "instance": true, + "instance 1": true, + "job": true, + "job 1": true, + "role": true, + "role 1": true, + "tier": true, + "tier 1": true, + "hostname 1": true + }, + "indexByName": { + "hostname": 0, + "name": 1, + 
"Value #last": 2, + "Value #ago": 3 + }, + "renameByName": { + "hostname": "Host", + "name": "Timer", + "Value #last": "Last Trigger", + "Value #ago": "Time Ago" + } + } + } + ], + "description": "Restic backup timers" + }, + { + "id": 12, + "title": "Service Restarts Over Time", + "type": "timeseries", + "gridPos": {"h": 6, "w": 12, "x": 12, "y": 18}, + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "targets": [ + { + "expr": "sum by (hostname) (increase(systemd_service_restart_total{hostname=~\"$hostname\"}[1h]))", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "lineWidth": 1, + "fillOpacity": 20, + "showPoints": "never", + "stacking": {"mode": "normal"} + } + } + }, + "options": { + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "description": "Service restart rate per hour" + } + ] +} -- 2.49.1