groups:
  - name: common_rules
    rules:
      - alert: node_down
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
      - alert: low_disk_space
        expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Disk space is low on {{ $labels.instance }}. Please check."
      # Two alerts deliberately share the name high_cpu_load (legal in
      # Prometheus): the build host nix-cache01 is excluded from the first
      # and gets a longer grace period (2h instead of 15m) in the second.
      - alert: high_cpu_load
        expr: max(node_load5{instance!="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance!="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is high on {{ $labels.instance }}. Please check."
      - alert: high_cpu_load
        expr: max(node_load5{instance="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is high on {{ $labels.instance }}. Please check."
- alert: low_memory expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 for: 2m labels: severity: warning annotations: summary: Low available memory on {{ $labels.instance }} description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }} - alert: oom_kill expr: increase(node_vmstat_oom_kill[1m]) > 0 for: 0m labels: severity: warning annotations: summary: Host OOM kill detected on {{ $labels.instance }} description: OOM kill detected - alert: nixos_upgrade_failed expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1 for: 0m labels: severity: critical annotations: summary: "NixOS upgrade failed on {{ $labels.instance }}" description: "NixOS upgrade failed on {{ $labels.instance }}" - alert: promtail_not_running expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0 for: 5m labels: severity: warning annotations: summary: "Promtail service not running on {{ $labels.instance }}" description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes." - alert: filesystem_filling_up expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 for: 1h labels: severity: warning annotations: summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}" description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours." - alert: systemd_not_running expr: node_systemd_system_running == 0 for: 5m labels: severity: critical annotations: summary: "Systemd not in running state on {{ $labels.instance }}" description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state." 
- alert: high_file_descriptors expr: node_filefd_allocated / node_filefd_maximum > 0.8 for: 5m labels: severity: warning annotations: summary: "High file descriptor usage on {{ $labels.instance }}" description: "More than 80% of file descriptors are in use on {{ $labels.instance }}." - alert: host_reboot expr: changes(node_boot_time_seconds[10m]) > 0 for: 0m labels: severity: info annotations: summary: "Host {{ $labels.instance }} has rebooted" description: "Host {{ $labels.instance }} has rebooted." - name: nameserver_rules rules: - alert: unbound_down expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0 for: 5m labels: severity: critical annotations: summary: "Unbound not running on {{ $labels.instance }}" description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes." - alert: nsd_down expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0 for: 5m labels: severity: critical annotations: summary: "NSD not running on {{ $labels.instance }}" description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." - alert: unbound_low_cache_hit_ratio expr: (rate(unbound_cache_hits_total[5m]) / (rate(unbound_cache_hits_total[5m]) + rate(unbound_cache_misses_total[5m]))) < 0.5 for: 15m labels: severity: warning annotations: summary: "Low DNS cache hit ratio on {{ $labels.instance }}" description: "Unbound cache hit ratio is below 50% on {{ $labels.instance }}." - name: http_proxy_rules rules: - alert: caddy_down expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0 for: 5m labels: severity: critical annotations: summary: "Caddy not running on {{ $labels.instance }}" description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." 
- alert: caddy_upstream_unhealthy expr: caddy_reverse_proxy_upstreams_healthy == 0 for: 5m labels: severity: warning annotations: summary: "Caddy upstream unhealthy for {{ $labels.upstream }}" description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}." - alert: caddy_high_error_rate expr: rate(caddy_http_request_errors_total[5m]) > 1 for: 5m labels: severity: warning annotations: summary: "High HTTP error rate on {{ $labels.instance }}" description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}." - name: nats_rules rules: - alert: nats_down expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0 for: 5m labels: severity: critical annotations: summary: "NATS not running on {{ $labels.instance }}" description: "NATS has been down on {{ $labels.instance }} more than 5 minutes." - alert: nats_slow_consumers expr: nats_core_slow_consumer_count > 0 for: 5m labels: severity: warning annotations: summary: "NATS has slow consumers on {{ $labels.instance }}" description: "NATS has {{ $value }} slow consumers on {{ $labels.instance }}." - name: nix_cache_rules rules: - alert: build_flakes_service_not_active_recently expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1 for: 0m labels: severity: critical annotations: summary: "The build-flakes service on {{ $labels.instance }} has not run recently" description: "The build-flakes service on {{ $labels.instance }} has not run recently" - alert: build_flakes_error expr: build_flakes_error == 1 labels: severity: warning annotations: summary: "The build-flakes job has failed for host {{ $labels.host }}." description: "The build-flakes job has failed for host {{ $labels.host }}." 
- alert: harmonia_down expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0 for: 5m labels: severity: critical annotations: summary: "Harmonia not running on {{ $labels.instance }}" description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes." - alert: low_disk_space_nix expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10 for: 5m labels: severity: warning annotations: summary: "Disk space low on /nix for {{ $labels.instance }}" description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check." - name: home_assistant_rules rules: - alert: home_assistant_down expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Home assistant not running on {{ $labels.instance }}" description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes." - alert: zigbee2mqtt_down expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0 for: 5m labels: severity: critical annotations: summary: "Zigbee2mqtt not running on {{ $labels.instance }}" description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes." - alert: mosquitto_down expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0 for: 5m labels: severity: critical annotations: summary: "Mosquitto not running on {{ $labels.instance }}" description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes." 
- name: smartctl_rules rules: - alert: smart_critical_warning expr: smartctl_device_critical_warning > 0 for: 0m labels: severity: critical annotations: summary: SMART critical warning (instance {{ $labels.instance }}) description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: smart_media_errors expr: smartctl_device_media_errors > 0 for: 0m labels: severity: critical annotations: summary: SMART media errors (instance {{ $labels.instance }}) description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: smart_wearout_indicator expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold for: 0m labels: severity: critical annotations: summary: SMART Wearout Indicator (instance {{ $labels.instance }}) description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: smartctl_high_temperature expr: smartctl_device_temperature > 60 for: 5m labels: severity: warning annotations: summary: "Disk temperature above 60C on {{ $labels.instance }}" description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C." - name: wireguard_rules rules: - alert: wireguard_handshake_timeout expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300 for: 1m labels: severity: warning annotations: summary: "Wireguard handshake timeout on {{ $labels.instance }}" description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}." 
- name: monitoring_rules rules: - alert: prometheus_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Prometheus service not running on {{ $labels.instance }}" description: "Prometheus service not running on {{ $labels.instance }}" - alert: alertmanager_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Alertmanager service not running on {{ $labels.instance }}" description: "Alertmanager service not running on {{ $labels.instance }}" - alert: pushgateway_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Pushgateway service not running on {{ $labels.instance }}" description: "Pushgateway service not running on {{ $labels.instance }}" - alert: loki_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Loki service not running on {{ $labels.instance }}" description: "Loki service not running on {{ $labels.instance }}" - alert: grafana_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 for: 5m labels: severity: warning annotations: summary: "Grafana service not running on {{ $labels.instance }}" description: "Grafana service not running on {{ $labels.instance }}" - alert: tempo_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0 for: 5m labels: severity: warning annotations: summary: "Tempo service not running on {{ $labels.instance }}" description: "Tempo 
service not running on {{ $labels.instance }}" - alert: pyroscope_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0 for: 5m labels: severity: warning annotations: summary: "Pyroscope service not running on {{ $labels.instance }}" description: "Pyroscope service not running on {{ $labels.instance }}" - name: certificate_rules rules: - alert: certificate_expiring_soon expr: labmon_tlsconmon_certificate_seconds_left{address!="ca.home.2rjus.net:443"} < 86400 for: 5m labels: severity: warning annotations: summary: "TLS certificate expiring soon for {{ $labels.instance }}" description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours." - alert: step_ca_serving_cert_expiring expr: labmon_tlsconmon_certificate_seconds_left{address="ca.home.2rjus.net:443"} < 3600 for: 5m labels: severity: critical annotations: summary: "Step-CA serving certificate expiring" description: "The step-ca serving certificate (24h auto-renewed) has less than 1 hour of validity left. Renewal may have failed." - alert: certificate_check_error expr: labmon_tlsconmon_certificate_check_error == 1 for: 5m labels: severity: warning annotations: summary: "Error checking certificate for {{ $labels.address }}" description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}." - alert: step_ca_certificate_expiring expr: labmon_stepmon_certificate_seconds_left < 3600 for: 5m labels: severity: critical annotations: summary: "Step-CA certificate expiring for {{ $labels.instance }}" description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}." - name: proxmox_rules rules: - alert: pve_node_down expr: pve_up{id=~"node/.*"} == 0 for: 5m labels: severity: critical annotations: summary: "Proxmox node {{ $labels.id }} is down" description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes." 
- alert: pve_guest_stopped expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1 for: 5m labels: severity: warning annotations: summary: "Proxmox VM {{ $labels.id }} is stopped" description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped." - name: postgres_rules rules: - alert: postgres_down expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "PostgreSQL not running on {{ $labels.instance }}" description: "PostgreSQL has been down on {{ $labels.instance }} more than 5 minutes." - alert: postgres_exporter_down expr: up{job="postgres"} == 0 for: 5m labels: severity: warning annotations: summary: "PostgreSQL exporter down on {{ $labels.instance }}" description: "Cannot scrape PostgreSQL metrics from {{ $labels.instance }}." - alert: postgres_high_connections expr: pg_stat_activity_count / pg_settings_max_connections > 0.8 for: 5m labels: severity: warning annotations: summary: "PostgreSQL connection pool near exhaustion on {{ $labels.instance }}" description: "PostgreSQL is using over 80% of max_connections on {{ $labels.instance }}." - name: auth_rules rules: - alert: authelia_down expr: node_systemd_unit_state{instance="auth01.home.2rjus.net:9100", name="authelia-auth.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Authelia not running on {{ $labels.instance }}" description: "Authelia has been down on {{ $labels.instance }} more than 5 minutes." - alert: lldap_down expr: node_systemd_unit_state{instance="auth01.home.2rjus.net:9100", name="lldap.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "LLDAP not running on {{ $labels.instance }}" description: "LLDAP has been down on {{ $labels.instance }} more than 5 minutes." 
- name: jellyfin_rules rules: - alert: jellyfin_down expr: up{job="jellyfin"} == 0 for: 5m labels: severity: warning annotations: summary: "Jellyfin not responding on {{ $labels.instance }}" description: "Cannot scrape Jellyfin metrics from {{ $labels.instance }} for 5 minutes." - name: vault_rules rules: - alert: openbao_down expr: node_systemd_unit_state{instance="vault01.home.2rjus.net:9100", name="openbao.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "OpenBao not running on {{ $labels.instance }}" description: "OpenBao has been down on {{ $labels.instance }} more than 5 minutes." - alert: openbao_sealed expr: vault_core_unsealed == 0 for: 5m labels: severity: critical annotations: summary: "OpenBao is sealed on {{ $labels.instance }}" description: "OpenBao has been sealed on {{ $labels.instance }} for more than 5 minutes." - alert: openbao_scrape_down expr: up{job="openbao"} == 0 for: 5m labels: severity: warning annotations: summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}" description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}."