groups:
  - name: common_rules
    rules:
      - alert: node_down
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
      - alert: low_disk_space
        expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Less than 10% of the root filesystem is free on {{ $labels.instance }}. Please check."
      - alert: high_cpu_load
        expr: max by (instance) (node_load5) > (count by (instance) (node_cpu_seconds_total{mode="idle"}) * 0.7)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "The 5-minute load average on {{ $labels.instance }} has exceeded 70% of the CPU count for 15 minutes."
      - alert: low_memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Low available memory on {{ $labels.instance }}"
          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"
      - alert: oom_kill
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Host OOM kill detected on {{ $labels.instance }}"
          description: "The kernel OOM killer was invoked on {{ $labels.instance }} within the last minute."
      - alert: nixos_upgrade_failed
        expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "NixOS upgrade failed on {{ $labels.instance }}"
          description: "The nixos-upgrade.service unit is in the failed state on {{ $labels.instance }}."
      - alert: promtail_not_running
        expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Promtail service not running on {{ $labels.instance }}"
          description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: filesystem_filling_up
        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
          description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
      - alert: systemd_not_running
        expr: node_systemd_system_running == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Systemd not in running state on {{ $labels.instance }}"
          description: "Systemd is not in the running state on {{ $labels.instance }}. The system may be degraded."
      - alert: high_file_descriptors
        expr: node_filefd_allocated / node_filefd_maximum > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High file descriptor usage on {{ $labels.instance }}"
          description: "More than 80% of available file descriptors are in use on {{ $labels.instance }}."
      - alert: host_reboot
        expr: changes(node_boot_time_seconds[10m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "Host {{ $labels.instance }} has rebooted"
          description: "The boot time of {{ $labels.instance }} changed within the last 10 minutes."
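# A hedged sketch of a promtool unit test for the node_down rule above. It
# belongs in a separate file (name assumed: rules_test.yml, loading this file
# as rules.yml) and runs with `promtool test rules rules_test.yml`:
#
#   rule_files:
#     - rules.yml
#   evaluation_interval: 1m
#   tests:
#     - interval: 1m
#       input_series:
#         - series: 'up{instance="host1:9100", job="node"}'   # hypothetical target
#           values: "0x10"                                    # down for 10 intervals
#       alert_rule_test:
#         - eval_time: 6m   # past the 5m "for" window
#           alertname: node_down
#           exp_alerts:
#             - exp_labels:
#                 severity: critical
#                 instance: host1:9100
#                 job: node
#               exp_annotations:
#                 summary: "Instance host1:9100 down"
#                 description: "host1:9100 of job node has been down for more than 5 minutes."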
  - name: nameserver_rules
    rules:
      - alert: unbound_down
        expr: node_systemd_unit_state{instance=~"ns.+", name="unbound.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Unbound not running on {{ $labels.instance }}"
          description: "Unbound has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: nsd_down
        expr: node_systemd_unit_state{instance=~"ns.+", name="nsd.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "NSD not running on {{ $labels.instance }}"
          description: "NSD has been down on {{ $labels.instance }} for more than 5 minutes."
  - name: http_proxy_rules
    rules:
      - alert: caddy_down
        expr: node_systemd_unit_state{instance="http-proxy.home.2rjus.net:9100", name="caddy.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Caddy not running on {{ $labels.instance }}"
          description: "Caddy has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: caddy_upstream_unhealthy
        expr: caddy_reverse_proxy_upstreams_healthy == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
          description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
      - alert: caddy_high_error_rate
        expr: rate(caddy_http_request_errors_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High HTTP error rate on {{ $labels.instance }}"
          description: "Caddy is serving more than one HTTP request error per second on {{ $labels.instance }}, averaged over 5 minutes."
  - name: nats_rules
    rules:
      - alert: nats_down
        expr: node_systemd_unit_state{instance="nats1.home.2rjus.net:9100", name="nats.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "NATS not running on {{ $labels.instance }}"
          description: "NATS has been down on {{ $labels.instance }} for more than 5 minutes."
  - name: nix_cache_rules
    rules:
      - alert: build_flakes_service_not_active_recently
        # Oneshot unit: sum_over_time is 0 when the unit was never in the
        # active state during the window. (count_over_time also counts the
        # 0-valued samples, so it would never drop below 1 while the target
        # is scraped.)
        expr: sum_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "The build-flakes service on {{ $labels.instance }} has not run recently"
          description: "build-flakes.service on {{ $labels.instance }} has not been active at any point during the last hour."
      - alert: build_flakes_error
        expr: build_flakes_error == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "The build-flakes job has failed for host {{ $labels.host }}"
          description: "The build-flakes job has failed for host {{ $labels.host }}."
      - alert: harmonia_down
        expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="harmonia.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Harmonia not running on {{ $labels.instance }}"
          description: "Harmonia has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: low_disk_space_nix
        expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on /nix for {{ $labels.instance }}"
          description: "Less than 10% of the /nix filesystem is free on {{ $labels.instance }}. Please check."
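# Every *_down rule in this file compares node_systemd_unit_state{state="active"}
# against 0, which assumes node_exporter's systemd collector is enabled and
# exporting those units. If the series is absent entirely (collector off, unit
# filtered out, target renamed), `== 0` matches nothing and the alert silently
# never fires. A hedged sketch of a guard for that failure mode (alert name and
# unit are illustrative):
#
#   - alert: unbound_unit_metric_absent
#     expr: absent(node_systemd_unit_state{name="unbound.service", state="active"})
#     for: 15m
#     labels:
#       severity: warning
#     annotations:
#       summary: "No systemd unit-state metric for unbound.service"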
  - name: home_assistant_rules
    rules:
      - alert: home_assistant_down
        expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Home Assistant not running on {{ $labels.instance }}"
          description: "Home Assistant has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: zigbee2mqtt_down
        expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="zigbee2mqtt.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Zigbee2MQTT not running on {{ $labels.instance }}"
          description: "Zigbee2MQTT has been down on {{ $labels.instance }} for more than 5 minutes."
      - alert: mosquitto_down
        expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="mosquitto.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Mosquitto not running on {{ $labels.instance }}"
          description: "Mosquitto has been down on {{ $labels.instance }} for more than 5 minutes."
  - name: smartctl_rules
    rules:
      - alert: smart_critical_warning
        expr: smartctl_device_critical_warning > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART critical warning (instance {{ $labels.instance }})"
          description: "Disk controller has a critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: smart_media_errors
        expr: smartctl_device_media_errors > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART media errors (instance {{ $labels.instance }})"
          description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: smart_wearout_indicator
        expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART wearout indicator (instance {{ $labels.instance }})"
          description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: smartctl_high_temperature
        expr: smartctl_device_temperature > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk temperature above 60C on {{ $labels.instance }}"
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
  - name: wireguard_rules
    rules:
      - alert: wireguard_handshake_timeout
        expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Wireguard handshake timeout on {{ $labels.instance }}"
          description: "Peer {{ $labels.public_key }} on {{ $labels.instance }} has not completed a handshake for more than 5 minutes."
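# The monitoring_rules group below watches the monitoring stack from the same
# Prometheus that evaluates it, so a dead Prometheus cannot alert on itself.
# A common complement is an always-firing watchdog routed to an external dead
# man's switch; a hedged sketch (not wired to anything in this setup):
#
#   - alert: Watchdog
#     expr: vector(1)
#     labels:
#       severity: none
#     annotations:
#       summary: "Always firing; its absence means the alerting pipeline is broken"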
  - name: monitoring_rules
    rules:
      - alert: prometheus_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus service not running on {{ $labels.instance }}"
          description: "The prometheus service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: alertmanager_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager service not running on {{ $labels.instance }}"
          description: "The alertmanager service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: pushgateway_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pushgateway service not running on {{ $labels.instance }}"
          description: "The pushgateway service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: loki_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Loki service not running on {{ $labels.instance }}"
          description: "The loki service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: grafana_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Grafana service not running on {{ $labels.instance }}"
          description: "The grafana service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: tempo_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tempo service not running on {{ $labels.instance }}"
          description: "The tempo service has not been active on {{ $labels.instance }} for 5 minutes."
      - alert: pyroscope_not_running
        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pyroscope service not running on {{ $labels.instance }}"
          description: "The podman-pyroscope service has not been active on {{ $labels.instance }} for 5 minutes."
  - name: certificate_rules
    rules:
      - alert: certificate_expiring_soon
        expr: labmon_tlsconmon_certificate_seconds_left < 86400
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "TLS certificate expiring soon for {{ $labels.address }}"
          description: "TLS certificate for {{ $labels.address }} expires within 24 hours."
      - alert: certificate_check_error
        expr: labmon_tlsconmon_certificate_check_error == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Error checking certificate for {{ $labels.address }}"
          description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
      - alert: step_ca_certificate_expiring
        expr: labmon_stepmon_certificate_seconds_left < 3600
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Step-CA certificate expiring for {{ $labels.instance }}"
          description: "Step-CA certificate expires within 1 hour on {{ $labels.instance }}."
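# pve_up and pve_onboot_status in the group below are assumed to come from a
# Proxmox VE exporter such as prometheus-pve-exporter, with id labels shaped
# like "node/<name>" and "qemu/<vmid>". The bare `and` in pve_guest_stopped
# keeps only stopped guests whose pve_onboot_status series has an identical
# label set; if the exporter adds labels that differ between the two metrics,
# the match can be restricted explicitly (hedged sketch, assuming `id` is the
# shared label):
#
#   pve_up{id=~"qemu/.*"} == 0 and on (id) pve_onboot_status == 1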
  - name: proxmox_rules
    rules:
      - alert: pve_node_down
        expr: pve_up{id=~"node/.*"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Proxmox node {{ $labels.id }} is down"
          description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
      - alert: pve_guest_stopped
        expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Proxmox VM {{ $labels.id }} is stopped"
          description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
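# The whole file can be syntax- and semantics-checked before deployment with
# promtool (file name assumed):
#
#   promtool check rules rules.yml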