grafana: add NixOS operations dashboard
Loki-based dashboard for tracking NixOS operations, including:
- Upgrade activity and success/failure stats
- Build activity during upgrades
- Bootstrap logs for new VM deployments
- ACME certificate renewal activity

Log panels use LogQL `json` parsing with `| keep host` to show clean messages with host labels.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
296
services/grafana/dashboards/nixos-operations.json
Normal file
296
services/grafana/dashboards/nixos-operations.json
Normal file
@@ -0,0 +1,296 @@
|
||||
{
|
||||
"uid": "nixos-operations",
|
||||
"title": "NixOS Operations",
|
||||
"tags": ["loki", "nixos", "operations", "homelab"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "1m",
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "host",
|
||||
"type": "query",
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"query": "label_values(host)",
|
||||
"refresh": 2,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"current": {"text": "All", "value": "$__all"}
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Upgrade Log Volume",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} [$__range]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{"color": "blue", "value": null}]
|
||||
},
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"description": "Total log entries from nixos-upgrade.service in selected time range"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Successful Upgrades",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"Done. The new configuration is\" [$__range]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{"color": "green", "value": null}]
|
||||
},
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"description": "Upgrades that completed successfully"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Upgrade Errors",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |~ \"(?i)error|failed\" [$__range]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "red", "value": 1}
|
||||
]
|
||||
},
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"description": "Upgrade log lines containing 'error' or 'failed' (case-insensitive match)"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Bootstrap Events",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(count_over_time({job=\"bootstrap\", host=~\"$host\"} [$__range]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{"color": "purple", "value": null}]
|
||||
},
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"description": "Bootstrap log entries from new VM deployments"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Upgrade Activity by Host",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (host) (count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} [5m]))",
|
||||
"legendFormat": "{{host}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 30,
|
||||
"showPoints": "never",
|
||||
"stacking": {"mode": "normal"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"description": "When upgrades ran on each host"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "ACME Certificate Activity",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (host) (count_over_time({systemd_unit=~\"acme.*\", host=~\"$host\"} [5m]))",
|
||||
"legendFormat": "{{host}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 30,
|
||||
"showPoints": "never",
|
||||
"stacking": {"mode": "normal"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"description": "ACME certificate renewal activity"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Recent Upgrade Completions",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"Done. The new configuration is\" | json | line_format \"{{.MESSAGE}}\" | keep host",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": true,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"description": "Successful upgrade completion messages showing the new system path"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Build Activity",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"building\" | json | line_format \"{{.MESSAGE}}\" | keep host",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": true,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"description": "Derivations being built during upgrades"
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Bootstrap Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 20},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"bootstrap\", host=~\"$host\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": true,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"description": "Logs from VM bootstrap process (new deployments)"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Upgrade Errors & Failures",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 28},
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |~ \"(?i)error|failed\" | json | line_format \"{{.MESSAGE}}\" | keep host",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": true,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"description": "Errors and failures during NixOS upgrades"
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user