monitoring: auto-generate Prometheus scrape targets from host configs

Add homelab.monitoring NixOS options (enable, scrapeTargets) following
the same pattern as homelab.dns. Prometheus scrape configs are now
auto-generated from flake host configurations and external targets,
replacing hardcoded target lists.
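
The new pattern, roughly (the job name and port below are purely
illustrative):

  # on the host that exposes an exporter
  homelab.monitoring.scrapeTargets = [{
    job_name = "example-exporter";
    port = 9101;
  }];

The monitoring host walks self.nixosConfigurations, turns each declared
target into "<hostname>.home.2rjus.net:<port>", and merges the result with
the manually maintained external targets.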

Also clean up the alert rules: use snake_case alert names, fix the
zigbee2mqtt typo, remove the duplicate pushgateway alert, add for clauses
to the monitoring_rules alerts, drop the hardcoded WireGuard public key
from the handshake alert, and add new alerts for certificates, Proxmox,
Caddy, smartctl temperature, filesystem fill prediction, systemd state,
file descriptors, and host reboots.

Fix the grafana scrape target port from 3100 to 3000.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 00:49:07 +01:00
parent 4e8cc124f2
commit dd1b64de27
12 changed files with 380 additions and 119 deletions


@@ -26,7 +26,11 @@
};
};
};
# monitoring
homelab.monitoring.scrapeTargets = [{
job_name = "wireguard";
port = 9586;
}];
services.prometheus.exporters.wireguard = {
enable = true;
};

lib/monitoring.nix (new file, 145 lines)

@@ -0,0 +1,145 @@
{ lib }:
let
# Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
extractIP = address:
let
parts = lib.splitString "/" address;
in
builtins.head parts;
# Check if a network interface name looks like a VPN/tunnel interface
isVpnInterface = ifaceName:
lib.hasPrefix "wg" ifaceName ||
lib.hasPrefix "tun" ifaceName ||
lib.hasPrefix "tap" ifaceName ||
lib.hasPrefix "vti" ifaceName;
# Extract monitoring info from a single host configuration
# Returns null if host should not be included
extractHostMonitoring = name: hostConfig:
let
cfg = hostConfig.config;
monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
hostname = cfg.networking.hostName;
networks = cfg.systemd.network.networks or { };
# Filter out VPN interfaces and find networks with static addresses
physicalNetworks = lib.filterAttrs
(netName: netCfg:
let
ifaceName = netCfg.matchConfig.Name or "";
in
!(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
networks;
# Get addresses from physical networks only
networkAddresses = lib.flatten (
lib.mapAttrsToList
(netName: netCfg: netCfg.address or [ ])
physicalNetworks
);
firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
in
if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
null
else
{
inherit hostname;
ip = extractIP firstAddress;
scrapeTargets = monConfig.scrapeTargets or [ ];
};
# Generate node-exporter targets from all flake hosts
generateNodeExporterTargets = self: externalTargets:
let
nixosConfigs = self.nixosConfigurations or { };
hostList = lib.filter (x: x != null) (
lib.mapAttrsToList extractHostMonitoring nixosConfigs
);
flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList;
in
flakeTargets ++ (externalTargets.nodeExporter or [ ]);
# Generate scrape configs from all flake hosts and external targets
generateScrapeConfigs = self: externalTargets:
let
nixosConfigs = self.nixosConfigurations or { };
hostList = lib.filter (x: x != null) (
lib.mapAttrsToList extractHostMonitoring nixosConfigs
);
# Collect all scrapeTargets from all hosts, grouped by job_name
allTargets = lib.flatten (map
(host:
map
(target: {
inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
hostname = host.hostname;
})
host.scrapeTargets
)
hostList
);
# Group targets by job_name
grouped = lib.groupBy (t: t.job_name) allTargets;
# Generate a scrape config for each job
flakeScrapeConfigs = lib.mapAttrsToList
(jobName: targets:
let
first = builtins.head targets;
targetAddrs = map
(t:
let
portStr = toString t.port;
in
"${t.hostname}.home.2rjus.net:${portStr}")
targets;
config = {
job_name = jobName;
static_configs = [{
targets = targetAddrs;
}];
}
// (lib.optionalAttrs (first.metrics_path != "/metrics") {
metrics_path = first.metrics_path;
})
// (lib.optionalAttrs (first.scheme != "http") {
scheme = first.scheme;
})
// (lib.optionalAttrs (first.scrape_interval != null) {
scrape_interval = first.scrape_interval;
})
// (lib.optionalAttrs first.honor_labels {
honor_labels = true;
});
in
config
)
grouped;
# External scrape configs
externalScrapeConfigs = map
(ext: {
job_name = ext.job_name;
static_configs = [{
targets = ext.targets;
}];
} // (lib.optionalAttrs (ext ? metrics_path) {
metrics_path = ext.metrics_path;
}) // (lib.optionalAttrs (ext ? scheme) {
scheme = ext.scheme;
}) // (lib.optionalAttrs (ext ? scrape_interval) {
scrape_interval = ext.scrape_interval;
}))
(externalTargets.scrapeConfigs or [ ]);
in
flakeScrapeConfigs ++ externalScrapeConfigs;
in
{
inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
}
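
As an illustration of the generated output (not part of this file): for a
host whose hostName is "ca" and which declares job_name = "step-ca" with
port = 9000, generateScrapeConfigs yields roughly

  {
    job_name = "step-ca";
    static_configs = [{
      targets = [ "ca.home.2rjus.net:9000" ];
    }];
  }

since metrics_path, scheme, scrape_interval and honor_labels are all at
their defaults and are therefore omitted.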


@@ -2,5 +2,6 @@
{
imports = [
./dns.nix
./monitoring.nix
];
}


@@ -0,0 +1,50 @@
{ config, lib, ... }:
let
cfg = config.homelab.monitoring;
in
{
options.homelab.monitoring = {
enable = lib.mkOption {
type = lib.types.bool;
default = true;
description = "Include this host in Prometheus node-exporter scrape targets";
};
scrapeTargets = lib.mkOption {
type = lib.types.listOf (lib.types.submodule {
options = {
job_name = lib.mkOption {
type = lib.types.str;
description = "Prometheus scrape job name";
};
port = lib.mkOption {
type = lib.types.port;
description = "Port to scrape metrics from";
};
metrics_path = lib.mkOption {
type = lib.types.str;
default = "/metrics";
description = "HTTP path to scrape metrics from";
};
scheme = lib.mkOption {
type = lib.types.str;
default = "http";
description = "HTTP scheme (http or https)";
};
scrape_interval = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "Override the global scrape interval for this target";
};
honor_labels = lib.mkOption {
type = lib.types.bool;
default = false;
description = "Whether to honor labels from the scraped target";
};
};
});
default = [ ];
description = "Additional Prometheus scrape targets exposed by this host";
};
};
}
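
For illustration (not part of this diff), a target using every optional
field might be declared like this on some host:

  homelab.monitoring.scrapeTargets = [{
    job_name = "example-job";         # hypothetical job
    port = 9100;
    metrics_path = "/custom/metrics"; # hypothetical path
    scheme = "https";
    scrape_interval = "30s";
    honor_labels = true;
  }];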


@@ -1,5 +1,9 @@
{ pkgs, unstable, ... }:
{
homelab.monitoring.scrapeTargets = [{
job_name = "step-ca";
port = 9000;
}];
sops.secrets."ca_root_pw" = {
sopsFile = ../../secrets/ca/secrets.yaml;
owner = "step-ca";


@@ -1,5 +1,11 @@
{ pkgs, config, ... }:
{
homelab.monitoring.scrapeTargets = [{
job_name = "home-assistant";
port = 8123;
metrics_path = "/api/prometheus";
scrape_interval = "60s";
}];
# Enable the Home Assistant service
services.home-assistant = {
enable = true;


@@ -3,4 +3,9 @@
imports = [
./proxy.nix
];
homelab.monitoring.scrapeTargets = [{
job_name = "caddy";
port = 80;
}];
}


@@ -1,5 +1,9 @@
{ pkgs, ... }:
{
homelab.monitoring.scrapeTargets = [{
job_name = "jellyfin";
port = 8096;
}];
services.jellyfin = {
enable = true;
};


@@ -0,0 +1,12 @@
# Monitoring targets for hosts not managed by this flake
# These are manually maintained and combined with auto-generated targets
{
nodeExporter = [
"gunter.home.2rjus.net:9100"
];
scrapeConfigs = [
{ job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
{ job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
{ job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
];
}


@@ -1,4 +1,11 @@
{ ... }:
{ self, lib, ... }:
let
monLib = import ../../lib/monitoring.nix { inherit lib; };
externalTargets = import ./external-targets.nix;
nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
in
{
services.prometheus = {
enable = true;
@@ -45,26 +52,16 @@
];
scrapeConfigs = [
# Auto-generated node-exporter targets from flake hosts + external
{
job_name = "node-exporter";
static_configs = [
{
targets = [
"ca.home.2rjus.net:9100"
"gunter.home.2rjus.net:9100"
"ha1.home.2rjus.net:9100"
"http-proxy.home.2rjus.net:9100"
"jelly01.home.2rjus.net:9100"
"monitoring01.home.2rjus.net:9100"
"nix-cache01.home.2rjus.net:9100"
"ns1.home.2rjus.net:9100"
"ns2.home.2rjus.net:9100"
"pgdb1.home.2rjus.net:9100"
"nats1.home.2rjus.net:9100"
];
targets = nodeExporterTargets;
}
];
}
# Local monitoring services (not auto-generated)
{
job_name = "prometheus";
static_configs = [
@@ -85,7 +82,7 @@
job_name = "grafana";
static_configs = [
{
targets = [ "localhost:3100" ];
targets = [ "localhost:3000" ];
}
];
}
@@ -98,13 +95,23 @@
];
}
{
job_name = "restic_rest";
job_name = "pushgateway";
honor_labels = true;
static_configs = [
{
targets = [ "10.69.12.52:8000" ];
targets = [ "localhost:9091" ];
}
];
}
{
job_name = "labmon";
static_configs = [
{
targets = [ "monitoring01.home.2rjus.net:9969" ];
}
];
}
# pve-exporter with complex relabel config
{
job_name = "pve-exporter";
static_configs = [
@@ -133,91 +140,8 @@
}
];
}
{
job_name = "caddy";
static_configs = [
{
targets = [ "http-proxy.home.2rjus.net" ];
}
];
}
{
job_name = "jellyfin";
static_configs = [
{
targets = [ "jelly01.home.2rjus.net:8096" ];
}
];
}
{
job_name = "smartctl";
static_configs = [
{
targets = [ "gunter.home.2rjus.net:9633" ];
}
];
}
{
job_name = "wireguard";
static_configs = [
{
targets = [ "http-proxy.home.2rjus.net:9586" ];
}
];
}
{
job_name = "home-assistant";
scrape_interval = "60s";
metrics_path = "/api/prometheus";
static_configs = [
{
targets = [ "ha1.home.2rjus.net:8123" ];
}
];
}
{
job_name = "ghettoptt";
static_configs = [
{
targets = [ "gunter.home.2rjus.net:8989" ];
}
];
}
{
job_name = "step-ca";
static_configs = [
{
targets = [ "ca.home.2rjus.net:9000" ];
}
];
}
{
job_name = "labmon";
static_configs = [
{
targets = [ "monitoring01.home.2rjus.net:9969" ];
}
];
}
{
job_name = "pushgateway";
honor_labels = true;
static_configs = [
{
targets = [ "localhost:9091" ];
}
];
}
{
job_name = "nix-cache_caddy";
scheme = "https";
static_configs = [
{
targets = [ "nix-cache.home.2rjus.net" ];
}
];
}
];
] ++ autoScrapeConfigs;
pushgateway = {
enable = true;
web = {


@@ -57,6 +57,38 @@ groups:
annotations:
summary: "Promtail service not running on {{ $labels.instance }}"
description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
- alert: filesystem_filling_up
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
for: 1h
labels:
severity: warning
annotations:
summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
- alert: systemd_not_running
expr: node_systemd_system_running == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Systemd not in running state on {{ $labels.instance }}"
description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
- alert: high_file_descriptors
expr: node_filefd_allocated / node_filefd_maximum > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High file descriptor usage on {{ $labels.instance }}"
description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
- alert: host_reboot
expr: changes(node_boot_time_seconds[10m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: "Host {{ $labels.instance }} has rebooted"
description: "Host {{ $labels.instance }} has rebooted."
- name: nameserver_rules
rules:
- alert: unbound_down
@@ -75,7 +107,7 @@ groups:
annotations:
summary: "NSD not running on {{ $labels.instance }}"
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
- name: http-proxy_rules
- name: http_proxy_rules
rules:
- alert: caddy_down
expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@ groups:
annotations:
summary: "Caddy not running on {{ $labels.instance }}"
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
- alert: caddy_upstream_unhealthy
expr: caddy_reverse_proxy_upstreams_healthy == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
- alert: caddy_high_error_rate
expr: rate(caddy_http_request_errors_total[5m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "High HTTP error rate on {{ $labels.instance }}"
description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
- name: nats_rules
rules:
- alert: nats_down
@@ -97,7 +145,7 @@ groups:
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
- name: nix_cache_rules
rules:
- alert: build-flakes_service_not_active_recently
- alert: build_flakes_service_not_active_recently
expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
for: 0m
labels:
@@ -138,7 +186,7 @@ groups:
annotations:
summary: "Home assistant not running on {{ $labels.instance }}"
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
- alert: zigbee2qmtt_down
- alert: zigbee2mqtt_down
expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
for: 5m
labels:
@@ -156,7 +204,7 @@ groups:
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
- name: smartctl_rules
rules:
- alert: SmartCriticalWarning
- alert: smart_critical_warning
expr: smartctl_device_critical_warning > 0
for: 0m
labels:
@@ -164,7 +212,7 @@ groups:
annotations:
summary: SMART critical warning (instance {{ $labels.instance }})
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
- alert: smart_media_errors
expr: smartctl_device_media_errors > 0
for: 0m
labels:
@@ -172,7 +220,7 @@ groups:
annotations:
summary: SMART media errors (instance {{ $labels.instance }})
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartWearoutIndicator
- alert: smart_wearout_indicator
expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
for: 0m
labels:
@@ -180,20 +228,29 @@ groups:
annotations:
summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: smartctl_high_temperature
expr: smartctl_device_temperature > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Disk temperature above 60C on {{ $labels.instance }}"
description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
- name: wireguard_rules
rules:
- alert: WireguardHandshake
expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
- alert: wireguard_handshake_timeout
expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
for: 1m
labels:
severity: warning
annotations:
summary: "Wireguard handshake timeout on {{ $labels.instance }}"
description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
- name: monitoring_rules
rules:
- alert: prometheus_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
@@ -201,6 +258,7 @@ groups:
description: "Prometheus service not running on {{ $labels.instance }}"
- alert: alertmanager_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
@@ -208,13 +266,7 @@ groups:
description: "Alertmanager service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
labels:
severity: critical
annotations:
summary: "Pushgateway service not running on {{ $labels.instance }}"
description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
@@ -222,6 +274,7 @@ groups:
description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: loki_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
@@ -229,6 +282,7 @@ groups:
description: "Loki service not running on {{ $labels.instance }}"
- alert: grafana_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
@@ -236,6 +290,7 @@ groups:
description: "Grafana service not running on {{ $labels.instance }}"
- alert: tempo_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
@@ -243,8 +298,53 @@ groups:
description: "Tempo service not running on {{ $labels.instance }}"
- alert: pyroscope_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Pyroscope service not running on {{ $labels.instance }}"
description: "Pyroscope service not running on {{ $labels.instance }}"
- name: certificate_rules
rules:
- alert: certificate_expiring_soon
expr: labmon_tlsconmon_certificate_seconds_left < 86400
for: 5m
labels:
severity: warning
annotations:
summary: "TLS certificate expiring soon for {{ $labels.instance }}"
description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
- alert: certificate_check_error
expr: labmon_tlsconmon_certificate_check_error == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Error checking certificate for {{ $labels.address }}"
description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
- alert: step_ca_certificate_expiring
expr: labmon_stepmon_certificate_seconds_left < 3600
for: 5m
labels:
severity: critical
annotations:
summary: "Step-CA certificate expiring for {{ $labels.instance }}"
description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
- name: proxmox_rules
rules:
- alert: pve_node_down
expr: pve_up{id=~"node/.*"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Proxmox node {{ $labels.id }} is down"
description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
- alert: pve_guest_stopped
expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Proxmox VM {{ $labels.id }} is stopped"
description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."


@@ -6,4 +6,10 @@
./proxy.nix
./nix.nix
];
homelab.monitoring.scrapeTargets = [{
job_name = "nix-cache_caddy";
port = 443;
scheme = "https";
}];
}