diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix index 39538e7..0ec707a 100644 --- a/services/monitoring/prometheus.nix +++ b/services/monitoring/prometheus.nix @@ -41,194 +41,7 @@ scrape_interval = "15s"; }; rules = [ - '' - groups: - - name: common_rules - rules: - - alert: node_down - expr: up == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Instance {{ $labels.instance }} down" - description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." - - alert: low_disk_space - expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Disk space low on {{ $labels.instance }}" - description: "Disk space is low on {{ $labels.instance }}. Please check." - - alert: high_cpu_load - expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7) - for: 15m - labels: - severity: warning - annotations: - summary: "High CPU load on {{ $labels.instance }}" - description: "CPU load is high on {{ $labels.instance }}. Please check." 
- - alert: low_memory - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 - for: 2m - labels: - severity: warning - annotations: - summary: Low available memory on {{ $labels.instance }} - description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }} - - alert: oom_kill - expr: increase(node_vmstat_oom_kill[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Host OOM kill detected on {{ $labels.instance }} - description: OOM kill detected - - alert: nixos_upgrade_failed - expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1 - for: 0m - labels: - severity: critical - annotations: - summary: "NixOS upgrade failed on {{ $labels.instance }}" - description: "NixOS upgrade failed on {{ $labels.instance }}" - - alert: promtail_not_running - expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Promtail service not running on {{ $labels.instance }}" - description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes." - - name: nameserver_rules - rules: - - alert: unbound_down - expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Unbound not running on {{ $labels.instance }}" - description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes." - - alert: nsd_down - expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "NSD not running on {{ $labels.instance }}" - description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." 
- - name: http-proxy_rules - rules: - - alert: caddy_down - expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Caddy not running on {{ $labels.instance }}" - description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." - - name: nats_rules - rules: - - alert: nats_down - expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "NATS not running on {{ $labels.instance }}" - description: "NATS has been down on {{ $labels.instance }} more than 5 minutes." - - name: nix_cache_rules - rules: - - alert: build-flakes_service_failed - expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1 - for: 0m - keep_firing_for: 10m - labels: - severity: critical - annotations: - summary: "The build-flakes service on {{ $labels.instance }} has failed" - description: "The build-flakes service on {{ $labels.instance }} has failed" - - alert: harmonia_down - expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Harmonia not running on {{ $labels.instance }}" - description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes." - - alert: low_disk_space_nix - expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10 - for: 5m - labels: - severity: warning - annotations: - summary: "Disk space low on /nix for {{ $labels.instance }}" - description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check." 
- - name: home_assistant_rules - rules: - - alert: home_assistant_down - expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Home assistant not running on {{ $labels.instance }}" - description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes." - - alert: zigbee2qmtt_down - expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Zigbee2mqtt not running on {{ $labels.instance }}" - description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes." - - alert: mosquitto_down - expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Mosquitto not running on {{ $labels.instance }}" - description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes." 
- - name: smartctl_rules - rules: - - alert: SmartCriticalWarning - expr: smartctl_device_critical_warning > 0 - for: 0m - labels: - severity: critical - annotations: - summary: SMART critical warning (instance {{ $labels.instance }}) - description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: SmartMediaErrors - expr: smartctl_device_media_errors > 0 - for: 0m - labels: - severity: critical - annotations: - summary: SMART media errors (instance {{ $labels.instance }}) - description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: SmartWearoutIndicator - expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold - for: 0m - labels: - severity: critical - annotations: - summary: SMART Wearout Indicator (instance {{ $labels.instance }}) - description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: wireguard_rules - rules: - - alert: WireguardHandshake - expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300 - for: 1m - labels: - severity: warning - annotations: - summary: "Wireguard handshake timeout on {{ $labels.instance }}" - description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes." 
- '' + (builtins.readFile ./rules.yml) ]; scrapeConfigs = [ diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml new file mode 100644 index 0000000..d240da1 --- /dev/null +++ b/services/monitoring/rules.yml @@ -0,0 +1,186 @@ +groups: + - name: common_rules + rules: + - alert: node_down + expr: up == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." + - alert: low_disk_space + expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Disk space low on {{ $labels.instance }}" + description: "Disk space is low on {{ $labels.instance }}. Please check." + - alert: high_cpu_load + expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7) + for: 15m + labels: + severity: warning + annotations: + summary: "High CPU load on {{ $labels.instance }}" + description: "CPU load is high on {{ $labels.instance }}. Please check." 
+      - alert: low_memory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Low available memory on {{ $labels.instance }}
+          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"  # double-quoted so \n is parsed as a newline
+      - alert: oom_kill
+        expr: increase(node_vmstat_oom_kill[1m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected on {{ $labels.instance }}
+          description: OOM kill detected
+      - alert: nixos_upgrade_failed
+        expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: "NixOS upgrade failed on {{ $labels.instance }}"
+          description: "NixOS upgrade failed on {{ $labels.instance }}"
+      - alert: promtail_not_running
+        expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Promtail service not running on {{ $labels.instance }}"
+          description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
+  - name: nameserver_rules
+    rules:
+      - alert: unbound_down
+        expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Unbound not running on {{ $labels.instance }}"
+          description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: nsd_down
+        expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "NSD not running on {{ $labels.instance }}"
+          description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
+ - name: http-proxy_rules + rules: + - alert: caddy_down + expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Caddy not running on {{ $labels.instance }}" + description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." + - name: nats_rules + rules: + - alert: nats_down + expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "NATS not running on {{ $labels.instance }}" + description: "NATS has been down on {{ $labels.instance }} more than 5 minutes." + - name: nix_cache_rules + rules: + - alert: build-flakes_service_failed + expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1 + for: 0m + keep_firing_for: 10m + labels: + severity: critical + annotations: + summary: "The build-flakes service on {{ $labels.instance }} has failed" + description: "The build-flakes service on {{ $labels.instance }} has failed" + - alert: harmonia_down + expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Harmonia not running on {{ $labels.instance }}" + description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes." + - alert: low_disk_space_nix + expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Disk space low on /nix for {{ $labels.instance }}" + description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check." 
+  - name: home_assistant_rules
+    rules:
+      - alert: home_assistant_down
+        expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Home assistant not running on {{ $labels.instance }}"
+          description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: zigbee2mqtt_down
+        expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
+          description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: mosquitto_down
+        expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Mosquitto not running on {{ $labels.instance }}"
+          description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
+  - name: smartctl_rules
+    rules:
+      - alert: SmartCriticalWarning
+        expr: smartctl_device_critical_warning > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: SMART critical warning (instance {{ $labels.instance }})
+          description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: SmartMediaErrors
+        expr: smartctl_device_media_errors > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: SMART media errors (instance {{ $labels.instance }})
+          description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: SmartWearoutIndicator
+        expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: SMART Wearout Indicator (instance {{ $labels.instance }})
+          description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+  - name: wireguard_rules
+    rules:
+      - alert: WireguardHandshake
+        expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Wireguard handshake timeout on {{ $labels.instance }}"
+          description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minute."