nixos-servers/services/monitoring/rules.yml
Torjus Håkestad 4f593126c0
monitoring01: remove host and migrate services to monitoring02
Remove monitoring01 host configuration and unused service modules
(prometheus, grafana, loki, tempo, pyroscope). Migrate blackbox,
exportarr, and pve exporters to monitoring02 with scrape configs
moved to VictoriaMetrics. Update alert rules, terraform vault
policies/secrets, http-proxy entries, and documentation to reflect
the monitoring02 migration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 21:50:20 +01:00

groups:
- name: common_rules
rules:
- alert: node_down
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: low_disk_space
expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Disk space low on {{ $labels.instance }}"
description: "Disk space is low on {{ $labels.instance }}. Please check."
# Build hosts (e.g., nix-cache01) are expected to run hot during builds, so they
# are excluded here and covered by the separate 2h rule below.
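# Threshold: 5-minute load average above 70% of the instance's core count, where
# the core count is obtained by counting idle-mode CPU series per instance.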
- alert: high_cpu_load
expr: max(node_load5{role!="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role!="build-host", mode="idle"}) * 0.7)
for: 15m
labels:
severity: warning
annotations:
summary: "High CPU load on {{ $labels.instance }}"
description: "CPU load is high on {{ $labels.instance }}. Please check."
- alert: high_cpu_load
expr: max(node_load5{role="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role="build-host", mode="idle"}) * 0.7)
for: 2h
labels:
severity: warning
annotations:
summary: "High CPU load on {{ $labels.instance }}"
description: "CPU load is high on {{ $labels.instance }}. Please check."
- alert: low_memory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: "Low available memory on {{ $labels.instance }}"
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
- alert: oom_kill
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: "Host OOM kill detected on {{ $labels.instance }}"
description: "The kernel OOM killer was invoked on {{ $labels.instance }} within the last minute."
- alert: nixos_upgrade_failed
expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
for: 0m
labels:
severity: critical
annotations:
summary: "NixOS upgrade failed on {{ $labels.instance }}"
description: "NixOS upgrade failed on {{ $labels.instance }}"
- alert: promtail_not_running
expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Promtail service not running on {{ $labels.instance }}"
description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
- alert: filesystem_filling_up
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
for: 1h
labels:
severity: warning
annotations:
summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
- alert: systemd_not_running
expr: node_systemd_system_running == 0
for: 10m
labels:
severity: warning
annotations:
summary: "Systemd not in running state on {{ $labels.instance }}"
description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state. Note: brief degraded states during nixos-rebuild are normal."
- alert: high_file_descriptors
expr: node_filefd_allocated / node_filefd_maximum > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High file descriptor usage on {{ $labels.instance }}"
description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
- alert: host_reboot
expr: changes(node_boot_time_seconds[10m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: "Host {{ $labels.instance }} has rebooted"
description: "Host {{ $labels.instance }} has rebooted."
- name: nameserver_rules
rules:
- alert: unbound_down
expr: node_systemd_unit_state{instance=~"ns.+", name="unbound.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Unbound not running on {{ $labels.instance }}"
description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
- alert: nsd_down
expr: node_systemd_unit_state{instance=~"ns.+", name="nsd.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "NSD not running on {{ $labels.instance }}"
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
# Only alert on primary DNS (secondary has cold cache after failover)
- alert: unbound_low_cache_hit_ratio
expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.2
for: 15m
labels:
severity: warning
annotations:
summary: "Low DNS cache hit ratio on {{ $labels.instance }}"
description: "Unbound cache hit ratio is below 20% on {{ $labels.instance }}."
- name: http_proxy_rules
rules:
- alert: caddy_down
expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Caddy not running on {{ $labels.instance }}"
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
- alert: caddy_upstream_unhealthy
expr: caddy_reverse_proxy_upstreams_healthy == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
- alert: caddy_high_error_rate
expr: rate(caddy_http_request_errors_total[5m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "High HTTP error rate on {{ $labels.instance }}"
description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
- name: nats_rules
rules:
- alert: nats_down
expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "NATS not running on {{ $labels.instance }}"
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
- alert: nats_slow_consumers
expr: nats_core_slow_consumer_count > 0
for: 5m
labels:
severity: warning
annotations:
summary: "NATS has slow consumers on {{ $labels.instance }}"
description: "NATS has {{ $value }} slow consumers on {{ $labels.instance }}."
- name: nix_cache_rules
rules:
- alert: harmonia_down
expr: node_systemd_unit_state{instance="nix-cache02.home.2rjus.net:9100", name="harmonia.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Harmonia not running on {{ $labels.instance }}"
description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes."
- name: home_assistant_rules
rules:
- alert: home_assistant_down
expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Home assistant not running on {{ $labels.instance }}"
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
- alert: zigbee2mqtt_down
expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
- alert: mosquitto_down
expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Mosquitto not running on {{ $labels.instance }}"
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
- alert: zigbee_sensor_stale
expr: (time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 14400
for: 5m
labels:
severity: warning
annotations:
summary: "Zigbee sensor {{ $labels.friendly_name }} is stale"
description: "Zigbee temperature sensor {{ $labels.entity }} has not reported data for over 4 hours. The sensor may have a dead battery or connectivity issues."
- name: smartctl_rules
rules:
- alert: smart_critical_warning
expr: smartctl_device_critical_warning > 0
for: 0m
labels:
severity: critical
annotations:
summary: "SMART critical warning (instance {{ $labels.instance }})"
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: smart_media_errors
expr: smartctl_device_media_errors > 0
for: 0m
labels:
severity: critical
annotations:
summary: "SMART media errors (instance {{ $labels.instance }})"
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: smart_wearout_indicator
expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
for: 0m
labels:
severity: critical
annotations:
summary: "SMART Wearout Indicator (instance {{ $labels.instance }})"
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: smartctl_high_temperature
expr: smartctl_device_temperature > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Disk temperature above 60C on {{ $labels.instance }}"
description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
- name: wireguard_rules
rules:
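# Peers re-handshake roughly every 2 minutes while traffic flows, so 5 minutes
# without one suggests a dead tunnel. Note: idle peers without persistent
# keepalive can also exceed this legitimately.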
- alert: wireguard_handshake_timeout
expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
for: 1m
labels:
severity: warning
annotations:
summary: "Wireguard handshake timeout on {{ $labels.instance }}"
description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
- name: monitoring_rules
rules:
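# Note: these self-monitoring rules are evaluated by the very stack they watch,
# so they cannot fire if vmalert or VictoriaMetrics is down entirely; a fully
# independent heartbeat (dead man's switch) would be needed to cover that case.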
- alert: victoriametrics_not_running
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "VictoriaMetrics service not running on {{ $labels.instance }}"
description: "VictoriaMetrics service not running on {{ $labels.instance }}"
- alert: vmalert_not_running
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "vmalert service not running on {{ $labels.instance }}"
description: "vmalert service not running on {{ $labels.instance }}"
- alert: alertmanager_not_running
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Alertmanager service not running on {{ $labels.instance }}"
description: "Alertmanager service not running on {{ $labels.instance }}"
- alert: loki_not_running
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Loki service not running on {{ $labels.instance }}"
description: "Loki service not running on {{ $labels.instance }}"
- alert: grafana_not_running
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Grafana service not running on {{ $labels.instance }}"
description: "Grafana service not running on {{ $labels.instance }}"
- name: proxmox_rules
rules:
- alert: pve_node_down
expr: pve_up{id=~"node/.*"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Proxmox node {{ $labels.id }} is down"
description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
- alert: pve_guest_stopped
expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Proxmox VM {{ $labels.id }} is stopped"
description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
- name: jellyfin_rules
rules:
- alert: jellyfin_down
expr: up{job="jellyfin"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Jellyfin not responding on {{ $labels.instance }}"
description: "Cannot scrape Jellyfin metrics from {{ $labels.instance }} for 5 minutes."
- name: vault_rules
rules:
- alert: openbao_down
expr: node_systemd_unit_state{instance="vault01.home.2rjus.net:9100", name="openbao.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "OpenBao not running on {{ $labels.instance }}"
description: "OpenBao has been down on {{ $labels.instance }} more than 5 minutes."
- alert: openbao_sealed
expr: vault_core_unsealed == 0
for: 5m
labels:
severity: critical
annotations:
summary: "OpenBao is sealed on {{ $labels.instance }}"
description: "OpenBao has been sealed on {{ $labels.instance }} for more than 5 minutes."
- alert: openbao_scrape_down
expr: up{job="openbao"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}"
description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}."
- name: certificate_rules
rules:
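# probe_ssl_earliest_cert_expiry is exported by the blackbox exporter and holds
# the Unix timestamp at which the first certificate in the chain expires.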
- alert: tls_certificate_expiring_soon
expr: (probe_ssl_earliest_cert_expiry - time()) < 86400 * 7
for: 1h
labels:
severity: warning
annotations:
summary: "TLS certificate expiring soon on {{ $labels.instance }}"
description: "The TLS certificate for {{ $labels.instance }} expires in less than 7 days."
- alert: tls_certificate_expiring_critical
expr: (probe_ssl_earliest_cert_expiry - time()) < 86400
for: 0m
labels:
severity: critical
annotations:
summary: "TLS certificate expiring within 24h on {{ $labels.instance }}"
description: "The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required."
- alert: tls_probe_failed
expr: probe_success{job="blackbox_tls"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "TLS probe failed for {{ $labels.instance }}"
description: "Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable."
- name: homelab_deploy_rules
rules:
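# increase() looks back 1h, so with for: 0m this alert keeps firing for up to
# an hour after the last failed build before resolving on its own.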
- alert: homelab_deploy_build_failed
expr: increase(homelab_deploy_build_host_total{status="failure"}[1h]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: "Build failed for {{ $labels.host }} in repo {{ $labels.repo }}"
description: "Host {{ $labels.host }} failed to build from {{ $labels.repo }} repository."
- alert: homelab_deploy_builder_down
expr: up{job="homelab-deploy-builder"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Homelab deploy builder not responding on {{ $labels.instance }}"
description: "Cannot scrape homelab-deploy-builder metrics from {{ $labels.instance }} for 5 minutes."