Files
nixos-servers/services/monitoring/rules.yml
Torjus Håkestad 2ca2509083
Some checks failed
Run nix flake check / flake-check (push) Failing after 3m55s
monitoring: increase filesystem_filling_up prediction window to 24h
Reduces false positives from transient Nix store growth by basing the
linear prediction on a 24h trend instead of 6h.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 09:36:27 +01:00

400 lines
18 KiB
YAML

groups:
- name: common_rules
  # Baseline host alerts. None of these expressions pin a specific instance,
  # so they apply to every series that matches the selectors.
  rules:
  # A scrape target has reported up == 0 for 5 minutes.
  - alert: node_down
    expr: up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Instance {{ $labels.instance }} down'
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
  # Free space on the root mountpoint is below 10% of its size.
  - alert: low_disk_space
    expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Disk space low on {{ $labels.instance }}'
      description: 'Disk space is low on {{ $labels.instance }}. Please check.'
  # Build hosts (e.g., nix-cache01) are expected to have high CPU during builds,
  # so high_cpu_load is defined twice: 15m for regular hosts, 2h for build hosts.
  # The threshold is 0.7 times the number of idle-mode CPU series per instance.
  - alert: high_cpu_load
    expr: >-
      max(node_load5{role!="build-host"}) by (instance)
      > (count by (instance)(node_cpu_seconds_total{role!="build-host", mode="idle"}) * 0.7)
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU load on {{ $labels.instance }}'
      description: 'CPU load is high on {{ $labels.instance }}. Please check.'
  - alert: high_cpu_load
    expr: >-
      max(node_load5{role="build-host"}) by (instance)
      > (count by (instance)(node_cpu_seconds_total{role="build-host", mode="idle"}) * 0.7)
    for: 2h
    labels:
      severity: warning
    annotations:
      summary: 'High CPU load on {{ $labels.instance }}'
      description: 'CPU load is high on {{ $labels.instance }}. Please check.'
  # Available memory is below 10% of total memory.
  - alert: low_memory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'Low available memory on {{ $labels.instance }}'
      # Double-quoted on purpose: \n is only turned into a newline inside a
      # double-quoted YAML scalar; unquoted it would be emitted literally.
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
  # The OOM-kill counter increased within the last minute; fires immediately.
  - alert: oom_kill
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: 'Host OOM kill detected on {{ $labels.instance }}'
      description: 'OOM kill detected'
  # nixos-upgrade.service is in the failed state; fires immediately.
  - alert: nixos_upgrade_failed
    expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'NixOS upgrade failed on {{ $labels.instance }}'
      description: 'NixOS upgrade failed on {{ $labels.instance }}'
  # promtail.service has not been in the active state for 5 minutes.
  - alert: promtail_not_running
    expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Promtail service not running on {{ $labels.instance }}'
      description: 'The promtail service has not been active on {{ $labels.instance }} for 5 minutes.'
  # Linear extrapolation of the last 24h of root free space drops below zero
  # within the next 24h (24*3600 seconds); must hold for 1h before firing.
  - alert: filesystem_filling_up
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[24h], 24*3600) < 0
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'Filesystem predicted to fill within 24h on {{ $labels.instance }}'
      description: 'Based on the last 24h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours.'
  # systemd reports a non-running system state for 10 minutes.
  - alert: systemd_not_running
    expr: node_systemd_system_running == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Systemd not in running state on {{ $labels.instance }}'
      description: 'Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state. Note: brief degraded states during nixos-rebuild are normal.'
  # Allocated file descriptors exceed 80% of the maximum.
  - alert: high_file_descriptors
    expr: node_filefd_allocated / node_filefd_maximum > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High file descriptor usage on {{ $labels.instance }}'
      description: 'More than 80% of file descriptors are in use on {{ $labels.instance }}.'
  # The reported boot time changed within the last 10 minutes (informational).
  - alert: host_reboot
    expr: changes(node_boot_time_seconds[10m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: 'Host {{ $labels.instance }} has rebooted'
      description: 'Host {{ $labels.instance }} has rebooted.'
- name: nameserver_rules
  # DNS server alerts. The systemd checks are limited to instances whose
  # instance label matches the regex "ns.+".
  rules:
  # unbound.service is not active on a matching instance for 5 minutes.
  - alert: unbound_down
    expr: node_systemd_unit_state{instance=~"ns.+", name="unbound.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Unbound not running on {{ $labels.instance }}'
      description: 'Unbound has been down on {{ $labels.instance }} more than 5 minutes.'
  # nsd.service is not active on a matching instance for 5 minutes.
  - alert: nsd_down
    expr: node_systemd_unit_state{instance=~"ns.+", name="nsd.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'NSD not running on {{ $labels.instance }}'
      description: 'NSD has been down on {{ $labels.instance }} more than 5 minutes.'
  # Only alert on primary DNS (secondary has cold cache after failover).
  # Ratio is hits / (hits + misses) over 5m rates; alert below 0.2 for 15m.
  - alert: unbound_low_cache_hit_ratio
    expr: >-
      (rate(unbound_cache_hits_total{dns_role="primary"}[5m])
      / (rate(unbound_cache_hits_total{dns_role="primary"}[5m])
      + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.2
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Low DNS cache hit ratio on {{ $labels.instance }}'
      description: 'Unbound cache hit ratio is below 20% on {{ $labels.instance }}.'
- name: http_proxy_rules
  # Alerts for the Caddy reverse proxy.
  rules:
  # caddy.service is not active on the http-proxy host for 5 minutes.
  - alert: caddy_down
    expr: node_systemd_unit_state{instance="http-proxy.home.2rjus.net:9100", name="caddy.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Caddy not running on {{ $labels.instance }}'
      description: 'Caddy has been down on {{ $labels.instance }} more than 5 minutes.'
  # An upstream's healthy gauge has been 0 for 5 minutes.
  - alert: caddy_upstream_unhealthy
    expr: caddy_reverse_proxy_upstreams_healthy == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Caddy upstream unhealthy for {{ $labels.upstream }}'
      description: 'Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}.'
  # The 5m rate of the request error counter exceeds 1 per second for 5 minutes.
  - alert: caddy_high_error_rate
    expr: rate(caddy_http_request_errors_total[5m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High HTTP error rate on {{ $labels.instance }}'
      description: 'Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}.'
- name: nats_rules
  # Alerts for the NATS server.
  rules:
  # nats.service is not active on nats1 for 5 minutes.
  - alert: nats_down
    expr: node_systemd_unit_state{instance="nats1.home.2rjus.net:9100", name="nats.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'NATS not running on {{ $labels.instance }}'
      description: 'NATS has been down on {{ $labels.instance }} more than 5 minutes.'
  # The slow-consumer count has been above zero for 5 minutes.
  - alert: nats_slow_consumers
    expr: nats_core_slow_consumer_count > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'NATS has slow consumers on {{ $labels.instance }}'
      description: 'NATS has {{ $value }} slow consumers on {{ $labels.instance }}.'
- name: nix_cache_rules
  # Alerts for the Nix binary cache host.
  rules:
  # harmonia.service is not in the active state on nix-cache02 for 5 minutes.
  - alert: harmonia_down
    expr: >-
      node_systemd_unit_state{instance="nix-cache02.home.2rjus.net:9100",
      name="harmonia.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Harmonia not running on {{ $labels.instance }}'
      description: 'Harmonia has been down on {{ $labels.instance }} more than 5 minutes.'
- name: home_assistant_rules
  # Alerts for the services on ha1 and for the sensors exported through the
  # hass_last_updated_time_seconds metric.
  rules:
  # home-assistant.service is not active on ha1 for 5 minutes.
  - alert: home_assistant_down
    expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Home assistant not running on {{ $labels.instance }}'
      description: 'Home assistant has been down on {{ $labels.instance }} more than 5 minutes.'
  # zigbee2mqtt.service is not active on ha1 for 5 minutes.
  - alert: zigbee2mqtt_down
    expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="zigbee2mqtt.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Zigbee2mqtt not running on {{ $labels.instance }}'
      description: 'Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes.'
  # mosquitto.service is not active on ha1 for 5 minutes.
  - alert: mosquitto_down
    expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="mosquitto.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Mosquitto not running on {{ $labels.instance }}'
      description: 'Mosquitto has been down on {{ $labels.instance }} more than 5 minutes.'
  # A matching temperature entity was last updated more than 14400 seconds
  # (4 hours) ago. The expression is an unquoted YAML scalar, so the doubled
  # backslash reaches PromQL unchanged and escapes the dot in the regex.
  - alert: zigbee_sensor_stale
    expr: (time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 14400
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Zigbee sensor {{ $labels.friendly_name }} is stale'
      description: 'Zigbee temperature sensor {{ $labels.entity }} has not reported data for over 4 hours. The sensor may have a dead battery or connectivity issues.'
- name: smartctl_rules
  # Disk health alerts based on smartctl_* metrics. The multi-line
  # descriptions are double-quoted so that \n is rendered as a newline.
  rules:
  # The device reports a non-zero critical warning value; fires immediately.
  - alert: smart_critical_warning
    expr: smartctl_device_critical_warning > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'SMART critical warning (instance {{ $labels.instance }})'
      description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # The device reports a non-zero media error count; fires immediately.
  - alert: smart_media_errors
    expr: smartctl_device_media_errors > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'SMART media errors (instance {{ $labels.instance }})'
      description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Available spare has dropped below the device's own reported threshold.
  - alert: smart_wearout_indicator
    expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'SMART Wearout Indicator (instance {{ $labels.instance }})'
      description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  # Reported device temperature has been above 60 for 5 minutes.
  - alert: smartctl_high_temperature
    expr: smartctl_device_temperature > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Disk temperature above 60C on {{ $labels.instance }}'
      description: 'Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C.'
- name: wireguard_rules
  # Alerts for WireGuard peers on interface wg0.
  rules:
  # More than 300 seconds have passed since the latest recorded handshake,
  # and that condition has held for 1 minute.
  - alert: wireguard_handshake_timeout
    expr: >-
      (time() - wireguard_latest_handshake_seconds{interface="wg0"})
      > 300
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Wireguard handshake timeout on {{ $labels.instance }}'
      description: 'Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}.'
- name: monitoring_rules
  # Alerts for the monitoring stack's own systemd units on monitoring02. Every
  # rule checks that the unit's state="active" series is 0 for 5 minutes.
  rules:
  - alert: victoriametrics_not_running
    expr: >-
      node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100",
      name="victoriametrics.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'VictoriaMetrics service not running on {{ $labels.instance }}'
      description: 'VictoriaMetrics service not running on {{ $labels.instance }}'
  - alert: vmalert_not_running
    expr: >-
      node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100",
      name="vmalert.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'vmalert service not running on {{ $labels.instance }}'
      description: 'vmalert service not running on {{ $labels.instance }}'
  - alert: alertmanager_not_running
    expr: >-
      node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100",
      name="alertmanager.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Alertmanager service not running on {{ $labels.instance }}'
      description: 'Alertmanager service not running on {{ $labels.instance }}'
  - alert: loki_not_running
    expr: >-
      node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100",
      name="loki.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Loki service not running on {{ $labels.instance }}'
      description: 'Loki service not running on {{ $labels.instance }}'
  # Grafana is the only unit in this group that is a warning, not critical.
  - alert: grafana_not_running
    expr: >-
      node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100",
      name="grafana.service", state="active"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Grafana service not running on {{ $labels.instance }}'
      description: 'Grafana service not running on {{ $labels.instance }}'
- name: proxmox_rules
  # Alerts for Proxmox nodes and guests, selected via the id label prefix
  # ("node/..." for nodes, "qemu/..." for VMs).
  rules:
  # A Proxmox node reports pve_up == 0 for 5 minutes.
  - alert: pve_node_down
    expr: pve_up{id=~"node/.*"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Proxmox node {{ $labels.id }} is down'
      description: 'Proxmox node {{ $labels.id }} has been down for more than 5 minutes.'
  # A VM is stopped although it is configured to start on boot.
  # A bare `and` only matches series whose label sets are identical, so the
  # match is restricted to id and instance here.
  # NOTE(review): presumably pve_onboot_status carries labels that pve_up does
  # not (e.g. node/type) - confirm against the exporter's output.
  # NOTE(review): the result keeps the labels of pve_up; verify that it has a
  # `name` label, otherwise {{ $labels.name }} renders empty.
  - alert: pve_guest_stopped
    expr: (pve_up{id=~"qemu/.*"} == 0) and on (id, instance) (pve_onboot_status == 1)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Proxmox VM {{ $labels.id }} is stopped'
      description: 'Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped.'
- name: jellyfin_rules
  # Alerts for the Jellyfin scrape job.
  rules:
  # The scrape of job "jellyfin" has been failing (up == 0) for 5 minutes.
  - alert: jellyfin_down
    expr: 'up{job="jellyfin"} == 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Jellyfin not responding on {{ $labels.instance }}'
      description: 'Cannot scrape Jellyfin metrics from {{ $labels.instance }} for 5 minutes.'
- name: vault_rules
  # Alerts for OpenBao on vault01: unit state, seal state and scrape health.
  rules:
  # openbao.service is not in the active state for 5 minutes.
  - alert: openbao_down
    expr: >-
      node_systemd_unit_state{instance="vault01.home.2rjus.net:9100",
      name="openbao.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'OpenBao not running on {{ $labels.instance }}'
      description: 'OpenBao has been down on {{ $labels.instance }} more than 5 minutes.'
  # The unsealed gauge has been 0 for 5 minutes.
  - alert: openbao_sealed
    expr: 'vault_core_unsealed == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'OpenBao is sealed on {{ $labels.instance }}'
      description: 'OpenBao has been sealed on {{ $labels.instance }} for more than 5 minutes.'
  # The scrape of job "openbao" has been failing (up == 0) for 5 minutes.
  - alert: openbao_scrape_down
    expr: 'up{job="openbao"} == 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Cannot scrape OpenBao metrics from {{ $labels.instance }}'
      description: 'OpenBao metrics endpoint is not responding on {{ $labels.instance }}.'
- name: certificate_rules
  # TLS certificate expiry and probe alerts based on probe_* metrics.
  rules:
  # Earliest certificate expiry is less than 86400 * 7 seconds (7 days) away;
  # the condition must hold for 1h.
  - alert: tls_certificate_expiring_soon
    expr: '(probe_ssl_earliest_cert_expiry - time()) < 86400 * 7'
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: 'TLS certificate expiring soon on {{ $labels.instance }}'
      description: 'The TLS certificate for {{ $labels.instance }} expires in less than 7 days.'
  # Earliest certificate expiry is less than 86400 seconds (24 hours) away;
  # fires immediately.
  - alert: tls_certificate_expiring_critical
    expr: '(probe_ssl_earliest_cert_expiry - time()) < 86400'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'TLS certificate expiring within 24h on {{ $labels.instance }}'
      description: 'The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required.'
  # The probe in job "blackbox_tls" has been unsuccessful for 5 minutes.
  - alert: tls_probe_failed
    expr: 'probe_success{job="blackbox_tls"} == 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'TLS probe failed for {{ $labels.instance }}'
      description: 'Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable.'
- name: homelab_deploy_rules
  # Alerts for the homelab-deploy builder.
  rules:
  # The failure-status build counter increased within the last hour; fires
  # immediately.
  - alert: homelab_deploy_build_failed
    expr: >-
      increase(homelab_deploy_build_host_total{status="failure"}[1h])
      > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: 'Build failed for {{ $labels.host }} in repo {{ $labels.repo }}'
      description: 'Host {{ $labels.host }} failed to build from {{ $labels.repo }} repository.'
  # The scrape of job "homelab-deploy-builder" has been failing for 5 minutes.
  - alert: homelab_deploy_builder_down
    expr: 'up{job="homelab-deploy-builder"} == 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Homelab deploy builder not responding on {{ $labels.instance }}'
      description: 'Cannot scrape homelab-deploy-builder metrics from {{ $labels.instance }} for 5 minutes.'