From b8d058d23ee4f99c9dd445932a26418b56531258 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?=
Date: Wed, 12 Feb 2025 20:34:22 +0100
Subject: [PATCH] Add alerting rules

---
Review notes (this area is not part of the commit message):

* low_memory: the description is double-quoted so that "\n" is a YAML
  escape (a line break) instead of a literal backslash-n in the rendered
  alert. The other new annotations are quoted as well, matching the
  existing rules in this file; their parsed values are unchanged.
* oom_kill: increase(...[1m]) needs at least two samples inside the
  one-minute window; presumably the node exporter is scraped more often
  than once a minute - TODO confirm.
* The leading whitespace of the hunk lines below was reconstructed;
  verify it against services/monitoring/prometheus.nix before applying.
* Only added lines were edited, so the hunk line counts and the diffstat
  are unchanged.

 services/monitoring/prometheus.nix | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix
index ced3ac2..f948163 100644
--- a/services/monitoring/prometheus.nix
+++ b/services/monitoring/prometheus.nix
@@ -67,14 +67,38 @@
             annotations:
               summary: "High CPU load on {{ $labels.instance }}"
               description: "CPU load is high on {{ $labels.instance }}. Please check."
+          - alert: low_memory
+            expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Low available memory on {{ $labels.instance }}"
+              description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
+          - alert: oom_kill
+            expr: increase(node_vmstat_oom_kill[1m]) > 0
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Host OOM kill detected on {{ $labels.instance }}"
+              description: "OOM kill detected"
           - alert: nixos_upgrade_failed
             expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
-            for: 1m
+            for: 0m
             labels:
               severity: critical
             annotations:
               summary: "NixOS upgrade failed on {{ $labels.instance }}"
               description: "NixOS upgrade failed on {{ $labels.instance }}"
+          - alert: promtail_not_running
+            expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Promtail service not running on {{ $labels.instance }}"
+              description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
       - name: nameserver_rules
         rules:
           - alert: unbound_down
@@ -105,7 +129,7 @@
               description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
       - name: nats_rules
         rules:
-          - alert: nats down
+          - alert: nats_down
             expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
             for: 5m
             labels:
@@ -115,7 +139,7 @@
               description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
       - name: nix_cache_rules
         rules:
-          - alert: build-flakes service failed
+          - alert: build-flakes_service_failed
             expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service",state="failed"} == 1
             for: 5m
             labels: