grafana: add NixOS fleet dashboard
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
Dashboard for monitoring NixOS deployments across the homelab:

- Hosts behind remote / needing reboot stat panels
- Fleet status table with revision, behind status, reboot needed, age
- Generation age bar chart (shows stale configs)
- Generations per host bar chart
- Deployment activity time series (see when hosts were updated)
- Flake input ages table
- Pie charts for hosts by revision and tier
- Tier filter variable

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
558
services/grafana/dashboards/nixos-fleet.json
Normal file
558
services/grafana/dashboards/nixos-fleet.json
Normal file
@@ -0,0 +1,558 @@
|
||||
{
|
||||
"uid": "nixos-fleet-homelab",
|
||||
"title": "NixOS Fleet - Homelab",
|
||||
"tags": ["nixos", "fleet", "homelab"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "1m",
|
||||
"time": {
|
||||
"from": "now-7d",
|
||||
"to": "now"
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "tier",
|
||||
"type": "query",
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"query": "label_values(nixos_flake_info, tier)",
|
||||
"refresh": 2,
|
||||
"includeAll": true, "allValue": ".*",
|
||||
"multi": false,
|
||||
"current": {"text": "All", "value": "$__all"}
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Hosts Behind Remote",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 1)",
|
||||
"legendFormat": "Behind",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "red", "value": 5}
|
||||
]
|
||||
},
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"description": "Number of hosts where current revision differs from remote master"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Hosts Needing Reboot",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(nixos_config_mismatch{tier=~\"$tier\"} == 1)",
|
||||
"legendFormat": "Need Reboot",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "orange", "value": 3},
|
||||
{"color": "red", "value": 5}
|
||||
]
|
||||
},
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"description": "Hosts where booted generation differs from current (switched but not rebooted)"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Total Hosts",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 0},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(nixos_flake_info{tier=~\"$tier\"})",
|
||||
"legendFormat": "Hosts",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{"color": "blue", "value": null}]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Nixpkgs Age",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 0},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(nixos_flake_input_age_seconds{input=\"nixpkgs\", tier=~\"$tier\"})",
|
||||
"legendFormat": "Nixpkgs",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 604800},
|
||||
{"color": "orange", "value": 1209600},
|
||||
{"color": "red", "value": 2592000}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"description": "Age of nixpkgs flake input (yellow >7d, orange >14d, red >30d)"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Hosts Up-to-date",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 0},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 0)",
|
||||
"legendFormat": "Up-to-date",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [{"color": "green", "value": null}]
|
||||
},
|
||||
"noValue": "0"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Fleet Status",
|
||||
"type": "table",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 4},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "nixos_flake_info{tier=~\"$tier\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "info"
|
||||
},
|
||||
{
|
||||
"expr": "nixos_flake_revision_behind{tier=~\"$tier\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "behind"
|
||||
},
|
||||
{
|
||||
"expr": "nixos_config_mismatch{tier=~\"$tier\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "mismatch"
|
||||
},
|
||||
{
|
||||
"expr": "nixos_generation_age_seconds{tier=~\"$tier\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "age"
|
||||
},
|
||||
{
|
||||
"expr": "nixos_generation_count{tier=~\"$tier\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "count"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Hostname"},
|
||||
"properties": [{"id": "custom.width", "value": 120}]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Current Rev"},
|
||||
"properties": [{"id": "custom.width", "value": 90}]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Remote Rev"},
|
||||
"properties": [{"id": "custom.width", "value": 90}]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Behind"},
|
||||
"properties": [
|
||||
{"id": "custom.width", "value": 70},
|
||||
{"id": "mappings", "value": [
|
||||
{"type": "value", "options": {"0": {"text": "No", "color": "green"}}},
|
||||
{"type": "value", "options": {"1": {"text": "Yes", "color": "red"}}}
|
||||
]},
|
||||
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Need Reboot"},
|
||||
"properties": [
|
||||
{"id": "custom.width", "value": 100},
|
||||
{"id": "mappings", "value": [
|
||||
{"type": "value", "options": {"0": {"text": "No", "color": "green"}}},
|
||||
{"type": "value", "options": {"1": {"text": "Yes", "color": "orange"}}}
|
||||
]},
|
||||
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Config Age"},
|
||||
"properties": [
|
||||
{"id": "unit", "value": "s"},
|
||||
{"id": "custom.width", "value": 100}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Generations"},
|
||||
"properties": [{"id": "custom.width", "value": 100}]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Tier"},
|
||||
"properties": [{"id": "custom.width", "value": 60}]
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Role"},
|
||||
"properties": [{"id": "custom.width", "value": 80}]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [{"displayName": "Hostname", "desc": false}]
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "joinByField",
|
||||
"options": {"byField": "hostname", "mode": "outer"}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"Time 1": true,
|
||||
"Time 2": true,
|
||||
"Time 3": true,
|
||||
"Time 4": true,
|
||||
"Time 5": true,
|
||||
"Value #info": true,
|
||||
"dns_role": true,
|
||||
"dns_role 1": true,
|
||||
"dns_role 2": true,
|
||||
"dns_role 3": true,
|
||||
"dns_role 4": true,
|
||||
"instance": true,
|
||||
"instance 1": true,
|
||||
"instance 2": true,
|
||||
"instance 3": true,
|
||||
"instance 4": true,
|
||||
"job": true,
|
||||
"job 1": true,
|
||||
"job 2": true,
|
||||
"job 3": true,
|
||||
"job 4": true,
|
||||
"nixos_version": true,
|
||||
"nixpkgs_rev": true,
|
||||
"role 1": true,
|
||||
"role 2": true,
|
||||
"role 3": true,
|
||||
"role 4": true,
|
||||
"tier 1": true,
|
||||
"tier 2": true,
|
||||
"tier 3": true,
|
||||
"tier 4": true
|
||||
},
|
||||
"indexByName": {
|
||||
"hostname": 0,
|
||||
"tier": 1,
|
||||
"role": 2,
|
||||
"current_rev": 3,
|
||||
"remote_rev": 4,
|
||||
"Value #behind": 5,
|
||||
"Value #mismatch": 6,
|
||||
"Value #age": 7,
|
||||
"Value #count": 8
|
||||
},
|
||||
"renameByName": {
|
||||
"hostname": "Hostname",
|
||||
"tier": "Tier",
|
||||
"role": "Role",
|
||||
"current_rev": "Current Rev",
|
||||
"remote_rev": "Remote Rev",
|
||||
"Value #behind": "Behind",
|
||||
"Value #mismatch": "Need Reboot",
|
||||
"Value #age": "Config Age",
|
||||
"Value #count": "Generations"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Generation Age by Host",
|
||||
"type": "bargauge",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(nixos_generation_age_seconds{tier=~\"$tier\"})",
|
||||
"legendFormat": "{{hostname}}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 86400},
|
||||
{"color": "orange", "value": 259200},
|
||||
{"color": "red", "value": 604800}
|
||||
]
|
||||
},
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"description": "How long ago each host's current config was deployed (yellow >1d, orange >3d, red >7d)"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Generations per Host",
|
||||
"type": "bargauge",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(nixos_generation_count{tier=~\"$tier\"})",
|
||||
"legendFormat": "{{hostname}}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "blue", "value": null},
|
||||
{"color": "purple", "value": 50}
|
||||
]
|
||||
},
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"description": "Total number of NixOS generations on each host"
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Deployment Activity (Generation Age Over Time)",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "nixos_generation_age_seconds{tier=~\"$tier\"}",
|
||||
"legendFormat": "{{hostname}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0,
|
||||
"showPoints": "never",
|
||||
"stacking": {"mode": "none"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||
},
|
||||
"description": "Generation age increases over time, drops to near-zero when deployed. Useful to see deployment patterns."
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Flake Input Ages",
|
||||
"type": "table",
|
||||
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 30},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max by (input) (nixos_flake_input_age_seconds)",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Flake Input"},
|
||||
"properties": [{"id": "custom.width", "value": 150}]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [{"displayName": "Age", "desc": true}]
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {"Time": true},
|
||||
"renameByName": {
|
||||
"input": "Flake Input",
|
||||
"Value": "Age"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"description": "Maximum age of each flake input across the whole fleet (not affected by the tier filter)"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Hosts by Revision",
|
||||
"type": "piechart",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 30},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by (current_rev) (nixos_flake_info{tier=~\"$tier\"})",
|
||||
"legendFormat": "{{current_rev}}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"legend": {"displayMode": "table", "placement": "right", "values": ["value"]},
|
||||
"pieType": "pie"
|
||||
},
|
||||
"description": "Distribution of hosts by their current flake revision"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Hosts by Tier",
|
||||
"type": "piechart",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 30},
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by (tier) (nixos_flake_info)",
|
||||
"legendFormat": "{{tier}}",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"legend": {"displayMode": "table", "placement": "right", "values": ["value"]},
|
||||
"pieType": "pie"
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "renameByRegex",
|
||||
"options": {
|
||||
"regex": "^$",
|
||||
"renamePattern": "prod"
|
||||
}
|
||||
}
|
||||
],
|
||||
"description": "Distribution of hosts by tier (test vs prod)"
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user