nixos-servers/services/monitoring/rules.yml
Torjus Håkestad dd1b64de27
monitoring: auto-generate Prometheus scrape targets from host configs
Add homelab.monitoring NixOS options (enable, scrapeTargets) following
the same pattern as homelab.dns. Prometheus scrape configs are now
auto-generated from flake host configurations and external targets,
replacing hardcoded target lists.
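
As a rough sketch of the intended shape, mirroring homelab.dns (the
attribute names beyond enable/scrapeTargets and the flake traversal
are illustrative assumptions, not the actual module code):

    { lib, ... }: {
      options.homelab.monitoring = {
        enable = lib.mkEnableOption "Prometheus scraping of this host";
        scrapeTargets = lib.mkOption {
          type = lib.types.listOf lib.types.str;
          default = [ ];
          description = "host:port targets Prometheus should scrape";
        };
      };
    }

    # On the monitoring host, assuming the flake's `self` is passed in
    # via specialArgs, targets can then be collected from all hosts
    # instead of being hardcoded:
    services.prometheus.scrapeConfigs = [{
      job_name = "node";
      static_configs = [{
        targets = lib.concatMap
          (host: host.config.homelab.monitoring.scrapeTargets)
          (lib.filter (host: host.config.homelab.monitoring.enable)
            (lib.attrValues self.nixosConfigurations));
      }];
    }];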

Also cleans up the alert rules: rename alerts to snake_case, fix a
zigbee2mqtt typo, remove a duplicate pushgateway alert, add for clauses
to monitoring_rules, remove a hardcoded WireGuard public key, and add
new alerts for certificates, Proxmox, Caddy, smartctl temperature,
filesystem fill prediction, systemd state, file descriptors, and host
reboots.

Fixes the Grafana scrape target port from 3100 to 3000.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 00:49:07 +01:00


groups:
  - name: common_rules
    rules:
      - alert: node_down
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
      - alert: low_disk_space
        expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Disk space is low on {{ $labels.instance }}. Please check."
      - alert: high_cpu_load
        expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is high on {{ $labels.instance }}. Please check."
      - alert: low_memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Low available memory on {{ $labels.instance }}"
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
      - alert: oom_kill
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Host OOM kill detected on {{ $labels.instance }}"
          description: "OOM kill detected"
      - alert: nixos_upgrade_failed
        expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "NixOS upgrade failed on {{ $labels.instance }}"
          description: "NixOS upgrade failed on {{ $labels.instance }}"
      - alert: promtail_not_running
        expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Promtail service not running on {{ $labels.instance }}"
          description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: filesystem_filling_up
        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
          description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
      - alert: systemd_not_running
        expr: node_systemd_system_running == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Systemd not in running state on {{ $labels.instance }}"
          description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
      - alert: high_file_descriptors
        expr: node_filefd_allocated / node_filefd_maximum > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High file descriptor usage on {{ $labels.instance }}"
          description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
      - alert: host_reboot
        expr: changes(node_boot_time_seconds[10m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "Host {{ $labels.instance }} has rebooted"
          description: "Host {{ $labels.instance }} has rebooted."
  - name: nameserver_rules
    rules:
      - alert: unbound_down
        expr: node_systemd_unit_state{instance=~"ns.+", name="unbound.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Unbound not running on {{ $labels.instance }}"
          description: "Unbound has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: nsd_down
        expr: node_systemd_unit_state{instance=~"ns.+", name="nsd.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "NSD not running on {{ $labels.instance }}"
          description: "NSD has been down on {{ $labels.instance }} for more than 5 minutes."
  - name: http_proxy_rules
    rules:
      - alert: caddy_down
        expr: node_systemd_unit_state{instance="http-proxy.home.2rjus.net:9100", name="caddy.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Caddy not running on {{ $labels.instance }}"
          description: "Caddy has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: caddy_upstream_unhealthy
        expr: caddy_reverse_proxy_upstreams_healthy == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
          description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
      - alert: caddy_high_error_rate
        expr: rate(caddy_http_request_errors_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High HTTP error rate on {{ $labels.instance }}"
          description: "Caddy has served more than one HTTP error per second for 5 minutes on {{ $labels.instance }}."
  - name: nats_rules
    rules:
      - alert: nats_down
        expr: node_systemd_unit_state{instance="nats1.home.2rjus.net:9100", name="nats.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "NATS not running on {{ $labels.instance }}"
          description: "NATS has been down on {{ $labels.instance }} for more than 5 minutes."
  - name: nix_cache_rules
    rules:
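      # max_over_time is 1 if any scrape in the last hour saw the
      # timer-driven build-flakes unit in the active state.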
      - alert: build_flakes_service_not_active_recently
        expr: max_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "The build-flakes service on {{ $labels.instance }} has not run recently"
          description: "The build-flakes service on {{ $labels.instance }} has not been active within the last hour."
      - alert: build_flakes_error
        expr: build_flakes_error == 1
        labels:
          severity: warning
        annotations:
          summary: "The build-flakes job has failed for host {{ $labels.host }}."
          description: "The build-flakes job has failed for host {{ $labels.host }}."
      - alert: harmonia_down
        expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="harmonia.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Harmonia not running on {{ $labels.instance }}"
          description: "Harmonia has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: low_disk_space_nix
        expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on /nix for {{ $labels.instance }}"
          description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
  - name: home_assistant_rules
    rules:
      - alert: home_assistant_down
        expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Home Assistant not running on {{ $labels.instance }}"
          description: "Home Assistant has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: zigbee2mqtt_down
        expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="zigbee2mqtt.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Zigbee2MQTT not running on {{ $labels.instance }}"
          description: "Zigbee2MQTT has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: mosquitto_down
        expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="mosquitto.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Mosquitto not running on {{ $labels.instance }}"
          description: "Mosquitto has been down on {{ $labels.instance }} for more than 5 minutes."
  - name: smartctl_rules
    rules:
      - alert: smart_critical_warning
        expr: smartctl_device_critical_warning > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART critical warning (instance {{ $labels.instance }})"
          description: "Disk controller has a critical warning on {{ $labels.instance }}, drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: smart_media_errors
        expr: smartctl_device_media_errors > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART media errors (instance {{ $labels.instance }})"
          description: "Disk controller detected media errors on {{ $labels.instance }}, drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: smart_wearout_indicator
        expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART wearout indicator (instance {{ $labels.instance }})"
          description: "Device is wearing out on {{ $labels.instance }}, drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: smartctl_high_temperature
        expr: smartctl_device_temperature > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk temperature above 60C on {{ $labels.instance }}"
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
  - name: wireguard_rules
    rules:
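      # Fires when the last completed handshake on wg0 is more than 300s
      # old; WireGuard peers normally re-handshake about every 2 minutes.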
      - alert: wireguard_handshake_timeout
        expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "WireGuard handshake timeout on {{ $labels.instance }}"
          description: "WireGuard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
  - name: monitoring_rules
    rules:
      - alert: prometheus_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus service not running on {{ $labels.instance }}"
          description: "Prometheus service not running on {{ $labels.instance }}"
      - alert: alertmanager_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager service not running on {{ $labels.instance }}"
          description: "Alertmanager service not running on {{ $labels.instance }}"
      - alert: pushgateway_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pushgateway service not running on {{ $labels.instance }}"
          description: "Pushgateway service not running on {{ $labels.instance }}"
      - alert: loki_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Loki service not running on {{ $labels.instance }}"
          description: "Loki service not running on {{ $labels.instance }}"
      - alert: grafana_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Grafana service not running on {{ $labels.instance }}"
          description: "Grafana service not running on {{ $labels.instance }}"
      - alert: tempo_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tempo service not running on {{ $labels.instance }}"
          description: "Tempo service not running on {{ $labels.instance }}"
      - alert: pyroscope_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pyroscope service not running on {{ $labels.instance }}"
          description: "Pyroscope service not running on {{ $labels.instance }}"
  - name: certificate_rules
    rules:
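      # Thresholds are in seconds: 86400 = 24 hours (tlsconmon),
      # 3600 = 1 hour (stepmon).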
      - alert: certificate_expiring_soon
        expr: labmon_tlsconmon_certificate_seconds_left < 86400
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "TLS certificate expiring soon for {{ $labels.instance }}"
          description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
      - alert: certificate_check_error
        expr: labmon_tlsconmon_certificate_check_error == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Error checking certificate for {{ $labels.address }}"
          description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
      - alert: step_ca_certificate_expiring
        expr: labmon_stepmon_certificate_seconds_left < 3600
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Step-CA certificate expiring for {{ $labels.instance }}"
          description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
  - name: proxmox_rules
    rules:
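      # pve_up ids are node/<name> for hypervisor nodes and qemu/<vmid>
      # for VMs.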
      - alert: pve_node_down
        expr: pve_up{id=~"node/.*"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Proxmox node {{ $labels.id }} is down"
          description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
      - alert: pve_guest_stopped
        expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Proxmox VM {{ $labels.id }} is stopped"
          description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."