# nixos-servers/services/monitoring/rules.yml

groups:
  # Baseline host alerts. No selector in this group pins a specific instance,
  # so each rule is evaluated against every series its expression returns.
  - name: common_rules
    rules:
      # A scrape target has reported up == 0 for 5 minutes.
      - alert: node_down
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
      # Free space on the "/" mountpoint is below 10% of its size.
      - alert: low_disk_space
        expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Disk space is low on {{ $labels.instance }}. Please check."
      # Build hosts (e.g., nix-cache01) are expected to have high CPU during builds
      # Non-build hosts: 5-minute load above 0.7x the per-instance count of
      # mode="idle" CPU series (presumably one series per CPU), held for 15m.
      - alert: high_cpu_load
        expr: max(node_load5{role!="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role!="build-host", mode="idle"}) * 0.7)
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is high on {{ $labels.instance }}. Please check."
      # Same threshold for role="build-host", but the condition must hold for
      # 2h before the alert fires.
      - alert: high_cpu_load
        expr: max(node_load5{role="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role="build-host", mode="idle"}) * 0.7)
        for: 2h
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is high on {{ $labels.instance }}. Please check."
      # MemAvailable is below 10% of MemTotal.
      - alert: low_memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Low available memory on {{ $labels.instance }}"
          # Must be double-quoted: in a plain scalar the \n below is emitted as
          # a literal backslash followed by "n" instead of a line break.
          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"
      # The OOM-kill counter increased within the last minute; fires at once.
      - alert: oom_kill
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Host OOM kill detected on {{ $labels.instance }}"
          description: "OOM kill detected"
      # nixos-upgrade.service is in the "failed" state; fires at once.
      - alert: nixos_upgrade_failed
        expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "NixOS upgrade failed on {{ $labels.instance }}"
          description: "NixOS upgrade failed on {{ $labels.instance }}"
      # promtail.service has not been in the "active" state for 5 minutes.
      - alert: promtail_not_running
        expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Promtail service not running on {{ $labels.instance }}"
          description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
      # Linear extrapolation of the last 24h of free bytes on "/" drops below
      # zero 24*3600 seconds ahead; the prediction must hold for 1h.
      - alert: filesystem_filling_up
        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[24h], 24*3600) < 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
          description: "Based on the last 24h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
      # node_systemd_system_running has been 0 for 10 minutes.
      - alert: systemd_not_running
        expr: node_systemd_system_running == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Systemd not in running state on {{ $labels.instance }}"
          description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state. Note: brief degraded states during nixos-rebuild are normal."
      # Allocated file descriptors exceed 80% of the maximum.
      - alert: high_file_descriptors
        expr: node_filefd_allocated / node_filefd_maximum > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High file descriptor usage on {{ $labels.instance }}"
          description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
      # The boot timestamp changed at least once in the last 10 minutes.
      - alert: host_reboot
        expr: changes(node_boot_time_seconds[10m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "Host {{ $labels.instance }} has rebooted"
          description: "Host {{ $labels.instance }} has rebooted."
  # DNS server alerts: unit state of unbound and NSD on instances matching
  # ns.+, plus the unbound cache hit ratio.
  - name: nameserver_rules
    rules:
      # unbound.service "active" state series reads 0 for 5 minutes.
      - alert: unbound_down
        expr: 'node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Unbound not running on {{ $labels.instance }}'
          description: 'Unbound has been down on {{ $labels.instance }} more than 5 minutes.'
      # nsd.service "active" state series reads 0 for 5 minutes.
      - alert: nsd_down
        expr: 'node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'NSD not running on {{ $labels.instance }}'
          description: 'NSD has been down on {{ $labels.instance }} more than 5 minutes.'
      # Only alert on primary DNS (secondary has cold cache after failover)
      # Ratio is hits / (hits + misses) over 5-minute rates; threshold 0.2.
      - alert: unbound_low_cache_hit_ratio
        expr: >-
          (rate(unbound_cache_hits_total{dns_role="primary"}[5m])
          / (rate(unbound_cache_hits_total{dns_role="primary"}[5m])
          + rate(unbound_cache_misses_total{dns_role="primary"}[5m])))
          < 0.2
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Low DNS cache hit ratio on {{ $labels.instance }}'
          description: 'Unbound cache hit ratio is below 20% on {{ $labels.instance }}.'
  # Caddy reverse-proxy alerts: unit state on the http-proxy host, upstream
  # health and HTTP error rate.
  - name: http_proxy_rules
    rules:
      # caddy.service "active" state series reads 0 for 5 minutes.
      - alert: caddy_down
        expr: 'node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Caddy not running on {{ $labels.instance }}'
          description: 'Caddy has been down on {{ $labels.instance }} more than 5 minutes.'
      # An upstream's healthy gauge reads 0 for 5 minutes.
      - alert: caddy_upstream_unhealthy
        expr: 'caddy_reverse_proxy_upstreams_healthy == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Caddy upstream unhealthy for {{ $labels.upstream }}'
          description: >-
            Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy
            on {{ $labels.instance }}.
      # Per-second rate of request errors over 5 minutes is above 1.
      - alert: caddy_high_error_rate
        expr: 'rate(caddy_http_request_errors_total[5m]) > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High HTTP error rate on {{ $labels.instance }}'
          description: >-
            Caddy is experiencing a high rate of HTTP errors
            on {{ $labels.instance }}.
  # NATS alerts: unit state on nats1 and the slow-consumer count.
  - name: nats_rules
    rules:
      # nats.service "active" state series reads 0 for 5 minutes.
      - alert: nats_down
        expr: 'node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'NATS not running on {{ $labels.instance }}'
          description: 'NATS has been down on {{ $labels.instance }} more than 5 minutes.'
      # Slow-consumer count stays above zero for 5 minutes; the count itself
      # is reported in the description through $value.
      - alert: nats_slow_consumers
        expr: 'nats_core_slow_consumer_count > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'NATS has slow consumers on {{ $labels.instance }}'
          description: 'NATS has {{ $value }} slow consumers on {{ $labels.instance }}.'
  # Nix binary cache alert: Harmonia unit state on nix-cache02.
  - name: nix_cache_rules
    rules:
      # harmonia.service "active" state series reads 0 for 5 minutes.
      - alert: harmonia_down
        expr: 'node_systemd_unit_state{instance="nix-cache02.home.2rjus.net:9100", name="harmonia.service", state="active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Harmonia not running on {{ $labels.instance }}'
          description: 'Harmonia has been down on {{ $labels.instance }} more than 5 minutes.'
  # Home automation alerts, all scoped to ha1: unit state for Home Assistant,
  # zigbee2mqtt and mosquitto, plus staleness of temperature sensors.
  - name: home_assistant_rules
    rules:
      # home-assistant.service "active" state series reads 0 for 5 minutes.
      - alert: home_assistant_down
        expr: 'node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"}  == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Home assistant not running on {{ $labels.instance }}'
          description: 'Home assistant has been down on {{ $labels.instance }} more than 5 minutes.'
      # zigbee2mqtt.service "active" state series reads 0 for 5 minutes.
      - alert: zigbee2mqtt_down
        expr: 'node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Zigbee2mqtt not running on {{ $labels.instance }}'
          description: 'Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes.'
      # mosquitto.service "active" state series reads 0 for 5 minutes.
      - alert: mosquitto_down
        expr: 'node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Mosquitto not running on {{ $labels.instance }}'
          description: 'Mosquitto has been down on {{ $labels.instance }} more than 5 minutes.'
      # Seconds since the last update exceed 14400 (4 hours) for entities
      # named sensor.<0x-hex id>_temperature or sensor.temp_server_temperature.
      - alert: zigbee_sensor_stale
        expr: '(time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 14400'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Zigbee sensor {{ $labels.friendly_name }} is stale'
          description: >-
            Zigbee temperature sensor {{ $labels.entity }} has not reported
            data for over 4 hours. The sensor may have a dead battery or
            connectivity issues.
  # Disk health alerts built on smartctl_device_* metrics. The three SMART
  # condition rules fire immediately (for: 0m); the temperature rule waits 5m.
  - name: smartctl_rules
    rules:
      # Critical-warning value is non-zero.
      - alert: smart_critical_warning
        expr: smartctl_device_critical_warning > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART critical warning (instance {{ $labels.instance }})"
          # Double-quoted so each \n renders as a line break.
          description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Media error count is non-zero.
      - alert: smart_media_errors
        expr: smartctl_device_media_errors > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART media errors (instance {{ $labels.instance }})"
          description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Available spare has dropped below the device's own reported threshold.
      - alert: smart_wearout_indicator
        expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "SMART Wearout Indicator (instance {{ $labels.instance }})"
          description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Reported temperature stays above 60 for 5 minutes.
      - alert: smartctl_high_temperature
        expr: smartctl_device_temperature > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk temperature above 60C on {{ $labels.instance }}"
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
  # WireGuard alert for interface wg0.
  - name: wireguard_rules
    rules:
      # More than 300 seconds have passed since a peer's latest handshake,
      # and that has held for 1 minute.
      - alert: wireguard_handshake_timeout
        expr: '(time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 'Wireguard handshake timeout on {{ $labels.instance }}'
          description: >-
            Wireguard handshake timeout on {{ $labels.instance }}
            for peer {{ $labels.public_key }}.
  # Alerts on the monitoring stack itself. Every rule checks that a systemd
  # unit on monitoring02 has an "active" state series equal to 0 for 5 minutes.
  - name: monitoring_rules
    rules:
      # victoriametrics.service
      - alert: victoriametrics_not_running
        expr: 'node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'VictoriaMetrics service not running on {{ $labels.instance }}'
          description: 'VictoriaMetrics service not running on {{ $labels.instance }}'
      # vmalert.service
      - alert: vmalert_not_running
        expr: 'node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'vmalert service not running on {{ $labels.instance }}'
          description: 'vmalert service not running on {{ $labels.instance }}'
      # alertmanager.service
      - alert: alertmanager_not_running
        expr: 'node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Alertmanager service not running on {{ $labels.instance }}'
          description: 'Alertmanager service not running on {{ $labels.instance }}'
      # loki.service
      - alert: loki_not_running
        expr: 'node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Loki service not running on {{ $labels.instance }}'
          description: 'Loki service not running on {{ $labels.instance }}'
      # grafana.service (the only rule in this group at warning severity)
      - alert: grafana_not_running
        expr: 'node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Grafana service not running on {{ $labels.instance }}'
          description: 'Grafana service not running on {{ $labels.instance }}'
  # Proxmox VE alerts, keyed on the "id" label (node/... or qemu/...).
  - name: proxmox_rules
    rules:
      # A series with id node/* reports pve_up == 0 for 5 minutes.
      - alert: pve_node_down
        expr: pve_up{id=~"node/.*"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Proxmox node {{ $labels.id }} is down"
          description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
      # A qemu guest is down while its onboot flag is 1.
      # The join is restricted to (instance, id): a bare "and" requires the
      # full label sets to be equal, and pve_onboot_status is documented by the
      # exporter with extra labels (node, type) that pve_up lacks, so the
      # unrestricted form would never match. If the label sets are in fact
      # identical, this form returns the same result.
      # NOTE(review): the result keeps the labels of pve_up, so $labels.name in
      # the description may render empty - confirm against the live series.
      - alert: pve_guest_stopped
        expr: pve_up{id=~"qemu/.*"} == 0 and on (instance, id) pve_onboot_status == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Proxmox VM {{ $labels.id }} is stopped"
          description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
  # Jellyfin alert based on scrape health of the "jellyfin" job.
  - name: jellyfin_rules
    rules:
      # The job's up series reads 0 for 5 minutes.
      - alert: jellyfin_down
        expr: 'up{job="jellyfin"} == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Jellyfin not responding on {{ $labels.instance }}'
          description: >-
            Cannot scrape Jellyfin metrics from {{ $labels.instance }}
            for 5 minutes.
  # OpenBao alerts: unit state on vault01, seal status, and scrape health of
  # the "openbao" job.
  - name: vault_rules
    rules:
      # openbao.service "active" state series reads 0 for 5 minutes.
      - alert: openbao_down
        expr: 'node_systemd_unit_state{instance="vault01.home.2rjus.net:9100", name="openbao.service", state="active"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'OpenBao not running on {{ $labels.instance }}'
          description: 'OpenBao has been down on {{ $labels.instance }} more than 5 minutes.'
      # vault_core_unsealed reads 0 for 5 minutes.
      - alert: openbao_sealed
        expr: 'vault_core_unsealed == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'OpenBao is sealed on {{ $labels.instance }}'
          description: >-
            OpenBao has been sealed on {{ $labels.instance }}
            for more than 5 minutes.
      # The job's up series reads 0 for 5 minutes.
      - alert: openbao_scrape_down
        expr: 'up{job="openbao"} == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Cannot scrape OpenBao metrics from {{ $labels.instance }}'
          description: >-
            OpenBao metrics endpoint is not responding
            on {{ $labels.instance }}.
  # TLS certificate alerts driven by probe_* metrics.
  - name: certificate_rules
    rules:
      # Earliest certificate expiry is less than 86400 * 7 seconds (7 days)
      # away; must hold for 1h.
      - alert: tls_certificate_expiring_soon
        expr: '(probe_ssl_earliest_cert_expiry - time()) < 86400 * 7'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'TLS certificate expiring soon on {{ $labels.instance }}'
          description: >-
            The TLS certificate for {{ $labels.instance }} expires
            in less than 7 days.
      # Earliest certificate expiry is less than 86400 seconds (24 hours)
      # away; fires at once.
      - alert: tls_certificate_expiring_critical
        expr: '(probe_ssl_earliest_cert_expiry - time()) < 86400'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'TLS certificate expiring within 24h on {{ $labels.instance }}'
          description: >-
            The TLS certificate for {{ $labels.instance }} expires in less
            than 24 hours. Immediate action required.
      # A probe of the "blackbox_tls" job has been failing for 5 minutes.
      - alert: tls_probe_failed
        expr: 'probe_success{job="blackbox_tls"} == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'TLS probe failed for {{ $labels.instance }}'
          description: >-
            Cannot connect to {{ $labels.instance }} to check TLS certificate.
            The service may be down or unreachable.
  # homelab-deploy alerts: failed host builds and scrape health of the
  # "homelab-deploy-builder" job.
  - name: homelab_deploy_rules
    rules:
      # The status="failure" build counter increased within the last hour;
      # fires at once.
      - alert: homelab_deploy_build_failed
        expr: 'increase(homelab_deploy_build_host_total{status="failure"}[1h]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Build failed for {{ $labels.host }} in repo {{ $labels.repo }}'
          description: >-
            Host {{ $labels.host }} failed to build
            from {{ $labels.repo }} repository.
      # The job's up series reads 0 for 5 minutes.
      - alert: homelab_deploy_builder_down
        expr: 'up{job="homelab-deploy-builder"} == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Homelab deploy builder not responding on {{ $labels.instance }}'
          description: >-
            Cannot scrape homelab-deploy-builder metrics
            from {{ $labels.instance }} for 5 minutes.