From f1ca20a3877a3ad9e870a2b78b94126ca9ad8b5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?=
Date: Tue, 11 Feb 2025 23:24:35 +0100
Subject: [PATCH] Add some alerting rules

---
Review notes (ignored by `git am`):

* Adds three alerts: nixos_upgrade_failed (all instances), plus
  build_flakes_service_failed and low_disk_space_nix, both pinned to
  nix-cache01.home.2rjus.net:9100 in a new nix_cache_rules group.
* The build-flakes alert is named build_flakes_service_failed so that it
  follows the snake_case naming of every other alert in this file.
* This copy was rebuilt from a version whose line breaks and indentation
  had been collapsed. The nesting below follows the YAML structure, but
  the exact leading whitespace must be checked against
  services/monitoring/prometheus.nix before applying.

 services/monitoring/prometheus.nix | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix
index 7f7c6f4..55c3a10 100644
--- a/services/monitoring/prometheus.nix
+++ b/services/monitoring/prometheus.nix
@@ -67,6 +67,14 @@
                 annotations:
                   summary: "High CPU load on {{ $labels.instance }}"
                   description: "CPU load is high on {{ $labels.instance }}. Please check."
+              - alert: nixos_upgrade_failed
+                expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
+                for: 1m
+                labels:
+                  severity: critical
+                annotations:
+                  summary: "NixOS upgrade failed on {{ $labels.instance }}"
+                  description: "NixOS upgrade failed on {{ $labels.instance }}"
           - name: nameserver_rules
             rules:
               - alert: unbound_down
@@ -105,6 +113,24 @@
                 annotations:
                   summary: "NATS not running on {{ $labels.instance }}"
                   description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
+          - name: nix_cache_rules
+            rules:
+              - alert: build_flakes_service_failed
+                expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1
+                for: 5m
+                labels:
+                  severity: critical
+                annotations:
+                  summary: "The build-flakes service on {{ $labels.instance }} has failed"
+                  description: "The build-flakes service on {{ $labels.instance }} has failed"
+              - alert: low_disk_space_nix
+                expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
+                for: 5m
+                labels:
+                  severity: warning
+                annotations:
+                  summary: "Disk space low on /nix for {{ $labels.instance }}"
+                  description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
           - name: home_assistant_rules
             rules:
               - alert: home_assistant_down