Remove monitoring01 host configuration and unused service modules (prometheus, grafana, loki, tempo, pyroscope). Migrate blackbox, exportarr, and pve exporters to monitoring02 with scrape configs moved to VictoriaMetrics. Update alert rules, terraform vault policies/secrets, http-proxy entries, and documentation to reflect the monitoring02 migration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
400 lines · 18 KiB · YAML
groups:
# Baseline node_exporter alerts applied to every scraped host.
- name: common_rules
  rules:
  - alert: node_down
    expr: up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  - alert: low_disk_space
    expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Disk space low on {{ $labels.instance }}"
      description: "Disk space is low on {{ $labels.instance }}. Please check."
  # Build hosts (e.g., nix-cache01) are expected to have high CPU during builds,
  # so they get a separate rule below with a longer "for" window (2h vs 15m).
  # NOTE: both rules deliberately share the alertname high_cpu_load; duplicate
  # alert names within a group are allowed, and the role label selectors
  # ensure the two rules never match the same series.
  - alert: high_cpu_load
    expr: max(node_load5{role!="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role!="build-host", mode="idle"}) * 0.7)
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "High CPU load on {{ $labels.instance }}"
      description: "CPU load is high on {{ $labels.instance }}. Please check."
  - alert: high_cpu_load
    expr: max(node_load5{role="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role="build-host", mode="idle"}) * 0.7)
    for: 2h
    labels:
      severity: warning
    annotations:
      summary: "High CPU load on {{ $labels.instance }}"
      description: "CPU load is high on {{ $labels.instance }}. Please check."
  - alert: low_memory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Low available memory on {{ $labels.instance }}"
      # Double-quoted so \n renders as a real newline; as a plain scalar the
      # backslash-n was emitted literally in notifications.
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
  - alert: oom_kill
    expr: increase(node_vmstat_oom_kill[1m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected on {{ $labels.instance }}"
      description: "OOM kill detected"
  - alert: nixos_upgrade_failed
    expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "NixOS upgrade failed on {{ $labels.instance }}"
      description: "NixOS upgrade failed on {{ $labels.instance }}"
  # NOTE(review): the migration commit removed the loki module; confirm
  # promtail is still deployed and this alert is still wanted.
  - alert: promtail_not_running
    expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Promtail service not running on {{ $labels.instance }}"
      description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
  - alert: filesystem_filling_up
    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
      description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
  - alert: systemd_not_running
    expr: node_systemd_system_running == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Systemd not in running state on {{ $labels.instance }}"
      description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state. Note: brief degraded states during nixos-rebuild are normal."
  - alert: high_file_descriptors
    expr: node_filefd_allocated / node_filefd_maximum > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High file descriptor usage on {{ $labels.instance }}"
      description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
  - alert: host_reboot
    expr: changes(node_boot_time_seconds[10m]) > 0
    for: 0m
    labels:
      severity: info
    annotations:
      summary: "Host {{ $labels.instance }} has rebooted"
      description: "Host {{ $labels.instance }} has rebooted."
# Alerts for the authoritative (nsd) and recursive (unbound) DNS hosts
# (instances matching ns.+).
- name: nameserver_rules
  rules:
  - alert: unbound_down
    expr: node_systemd_unit_state{instance=~"ns.+", name="unbound.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Unbound not running on {{ $labels.instance }}"
      description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
  - alert: nsd_down
    expr: node_systemd_unit_state{instance=~"ns.+", name="nsd.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "NSD not running on {{ $labels.instance }}"
      description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
  # Only alert on primary DNS (secondary has cold cache after failover)
  - alert: unbound_low_cache_hit_ratio
    expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.2
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "Low DNS cache hit ratio on {{ $labels.instance }}"
      description: "Unbound cache hit ratio is below 20% on {{ $labels.instance }}."
# Alerts for the Caddy reverse proxy on http-proxy.home.2rjus.net.
- name: http_proxy_rules
  rules:
  - alert: caddy_down
    expr: node_systemd_unit_state{instance="http-proxy.home.2rjus.net:9100", name="caddy.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Caddy not running on {{ $labels.instance }}"
      description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
  - alert: caddy_upstream_unhealthy
    expr: caddy_reverse_proxy_upstreams_healthy == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
      description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
  - alert: caddy_high_error_rate
    # More than 1 error/sec sustained for 5 minutes.
    expr: rate(caddy_http_request_errors_total[5m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High HTTP error rate on {{ $labels.instance }}"
      description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
# Alerts for the NATS message broker on nats1.
- name: nats_rules
  rules:
  - alert: nats_down
    expr: node_systemd_unit_state{instance="nats1.home.2rjus.net:9100", name="nats.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "NATS not running on {{ $labels.instance }}"
      description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
  - alert: nats_slow_consumers
    expr: nats_core_slow_consumer_count > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "NATS has slow consumers on {{ $labels.instance }}"
      description: "NATS has {{ $value }} slow consumers on {{ $labels.instance }}."
# Alerts for the Harmonia nix binary cache on nix-cache02.
- name: nix_cache_rules
  rules:
  - alert: harmonia_down
    expr: node_systemd_unit_state{instance="nix-cache02.home.2rjus.net:9100", name="harmonia.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Harmonia not running on {{ $labels.instance }}"
      description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes."
# Alerts for the Home Assistant stack on ha1 (HA, zigbee2mqtt, mosquitto)
# plus staleness detection for zigbee temperature sensors.
- name: home_assistant_rules
  rules:
  - alert: home_assistant_down
    expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Home assistant not running on {{ $labels.instance }}"
      description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
  - alert: zigbee2mqtt_down
    expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="zigbee2mqtt.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
      description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
  - alert: mosquitto_down
    expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="mosquitto.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Mosquitto not running on {{ $labels.instance }}"
      description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
  - alert: zigbee_sensor_stale
    # Matches zigbee sensors named by hex IEEE address (0x...) plus the
    # temp_server sensor; fires when no update for 4h (14400s).
    expr: (time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 14400
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Zigbee sensor {{ $labels.friendly_name }} is stale"
      description: "Zigbee temperature sensor {{ $labels.entity }} has not reported data for over 4 hours. The sensor may have a dead battery or connectivity issues."
# Disk-health alerts from smartctl_exporter (NVMe/SATA SMART data).
- name: smartctl_rules
  rules:
  - alert: smart_critical_warning
    expr: smartctl_device_critical_warning > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "SMART critical warning (instance {{ $labels.instance }})"
      description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: smart_media_errors
    expr: smartctl_device_media_errors > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "SMART media errors (instance {{ $labels.instance }})"
      description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: smart_wearout_indicator
    # Fires when remaining spare capacity drops below the drive's own
    # manufacturer-set threshold.
    expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "SMART Wearout Indicator (instance {{ $labels.instance }})"
      description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - alert: smartctl_high_temperature
    expr: smartctl_device_temperature > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Disk temperature above 60C on {{ $labels.instance }}"
      description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
# Alerts for wireguard tunnel liveness (wg0 only).
- name: wireguard_rules
  rules:
  - alert: wireguard_handshake_timeout
    # Wireguard re-handshakes roughly every 2 minutes when traffic flows;
    # 300s without one indicates a dead or idle peer.
    expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Wireguard handshake timeout on {{ $labels.instance }}"
      description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
# Self-monitoring for the monitoring02 stack (VictoriaMetrics, vmalert,
# Alertmanager, Loki, Grafana). If monitoring02 itself is down these will
# not fire; node_down from common_rules is the backstop.
- name: monitoring_rules
  rules:
  - alert: victoriametrics_not_running
    expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "VictoriaMetrics service not running on {{ $labels.instance }}"
      description: "VictoriaMetrics service not running on {{ $labels.instance }}"
  - alert: vmalert_not_running
    expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "vmalert service not running on {{ $labels.instance }}"
      description: "vmalert service not running on {{ $labels.instance }}"
  - alert: alertmanager_not_running
    expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Alertmanager service not running on {{ $labels.instance }}"
      description: "Alertmanager service not running on {{ $labels.instance }}"
  # NOTE(review): the monitoring02 migration removed the loki and grafana
  # service modules per the change description — confirm these two alerts
  # still correspond to deployed units, otherwise they can never fire.
  - alert: loki_not_running
    expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Loki service not running on {{ $labels.instance }}"
      description: "Loki service not running on {{ $labels.instance }}"
  - alert: grafana_not_running
    expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Grafana service not running on {{ $labels.instance }}"
      description: "Grafana service not running on {{ $labels.instance }}"
# Alerts from the Proxmox VE exporter (pve_* metrics).
- name: proxmox_rules
  rules:
  - alert: pve_node_down
    expr: pve_up{id=~"node/.*"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Proxmox node {{ $labels.id }} is down"
      description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
  - alert: pve_guest_stopped
    # Only VMs marked to start on boot (onboot=1) are expected to be running.
    expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Proxmox VM {{ $labels.id }} is stopped"
      description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
# Alert on the Jellyfin metrics endpoint becoming unreachable.
- name: jellyfin_rules
  rules:
  - alert: jellyfin_down
    expr: up{job="jellyfin"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Jellyfin not responding on {{ $labels.instance }}"
      description: "Cannot scrape Jellyfin metrics from {{ $labels.instance }} for 5 minutes."
# Alerts for the OpenBao (Vault fork) secrets server on vault01.
# OpenBao exports Vault-compatible metric names (vault_core_*).
- name: vault_rules
  rules:
  - alert: openbao_down
    expr: node_systemd_unit_state{instance="vault01.home.2rjus.net:9100", name="openbao.service", state="active"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "OpenBao not running on {{ $labels.instance }}"
      description: "OpenBao has been down on {{ $labels.instance }} more than 5 minutes."
  - alert: openbao_sealed
    # A sealed bao cannot serve secrets; requires manual unseal.
    expr: vault_core_unsealed == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "OpenBao is sealed on {{ $labels.instance }}"
      description: "OpenBao has been sealed on {{ $labels.instance }} for more than 5 minutes."
  - alert: openbao_scrape_down
    expr: up{job="openbao"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}"
      description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}."
# TLS certificate expiry alerts driven by the blackbox exporter probes.
- name: certificate_rules
  rules:
  - alert: tls_certificate_expiring_soon
    # 86400 * 7 = 7 days before expiry.
    expr: (probe_ssl_earliest_cert_expiry - time()) < 86400 * 7
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "TLS certificate expiring soon on {{ $labels.instance }}"
      description: "The TLS certificate for {{ $labels.instance }} expires in less than 7 days."
  - alert: tls_certificate_expiring_critical
    expr: (probe_ssl_earliest_cert_expiry - time()) < 86400
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: "TLS certificate expiring within 24h on {{ $labels.instance }}"
      description: "The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required."
  - alert: tls_probe_failed
    expr: probe_success{job="blackbox_tls"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "TLS probe failed for {{ $labels.instance }}"
      description: "Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable."
# Alerts for the homelab deploy/build pipeline.
- name: homelab_deploy_rules
  rules:
  - alert: homelab_deploy_build_failed
    # Any build failure counted in the last hour fires immediately.
    expr: increase(homelab_deploy_build_host_total{status="failure"}[1h]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Build failed for {{ $labels.host }} in repo {{ $labels.repo }}"
      description: "Host {{ $labels.host }} failed to build from {{ $labels.repo }} repository."
  - alert: homelab_deploy_builder_down
    expr: up{job="homelab-deploy-builder"} == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Homelab deploy builder not responding on {{ $labels.instance }}"
      description: "Cannot scrape homelab-deploy-builder metrics from {{ $labels.instance }} for 5 minutes."