From fb1a36a846d534c61d8cddd3340dce1db126a75f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?=
Date: Wed, 28 May 2025 21:26:04 +0200
Subject: [PATCH] Rework build-flakes alert rules

Replace the build-flakes_service_failed alert with two rules in the
nix_cache_rules group:

* build-flakes_service_not_active_recently (critical) fires when
  build-flakes.service was not seen in state "active" at any scrape
  during the last hour. max_over_time is used instead of
  count_over_time because the state="active" series carries a sample
  (0 or 1) on every scrape, so a sample count never drops below 1
  while the target is being scraped.
  NOTE(review): this assumes the unit reports state "active" while it
  runs; confirm against the unit's service type.

* build-flakes_error (warning) fires per host when build_flakes_error
  is 1. The rule needs an explicit alert name; a rule without one is
  rejected when the rules file is loaded.
---
 services/monitoring/rules.yml | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index d240da1..81dd859 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -97,15 +97,21 @@ groups:
           description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
   - name: nix_cache_rules
     rules:
-      - alert: build-flakes_service_failed
-        expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1
+      - alert: build-flakes_service_not_active_recently
+        expr: max_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
         for: 0m
-        keep_firing_for: 10m
         labels:
           severity: critical
         annotations:
-          summary: "The build-flakes service on {{ $labels.instance }} has failed"
-          description: "The build-flakes service on {{ $labels.instance }} has failed"
+          summary: "The build-flakes service on {{ $labels.instance }} has not run recently"
+          description: "The build-flakes service on {{ $labels.instance }} has not run recently"
+      - alert: build-flakes_error
+        expr: build_flakes_error == 1
+        labels:
+          severity: warning
+        annotations:
+          summary: "The build-flakes job has failed for host {{ $labels.host }}."
+          description: "The build-flakes job has failed for host {{ $labels.host }}."
       - alert: harmonia_down
         expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0
         for: 5m