From 28b8d7c115331a0d01be1ead8a9a9ecc056ee407 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?=
Date: Thu, 5 Feb 2026 02:28:48 +0100
Subject: [PATCH] monitoring: increase high_cpu_load duration for nix-cache01 to 2h

nix-cache01 regularly hits high CPU during nix builds, causing flappy alerts.
Keep the 15m threshold for all other hosts.

Co-Authored-By: Claude Opus 4.5
---
 services/monitoring/rules.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index b9b36aa..ea5dc42 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -18,13 +18,21 @@ groups:
           summary: "Disk space low on {{ $labels.instance }}"
           description: "Disk space is low on {{ $labels.instance }}. Please check."
       - alert: high_cpu_load
-        expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7)
+        expr: max(node_load5{instance!="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance!="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
         for: 15m
         labels:
           severity: warning
         annotations:
           summary: "High CPU load on {{ $labels.instance }}"
           description: "CPU load is high on {{ $labels.instance }}. Please check."
+      - alert: high_cpu_load
+        expr: max(node_load5{instance="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
+        for: 2h
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU load on {{ $labels.instance }}"
+          description: "CPU load is high on {{ $labels.instance }}. Please check."
       - alert: low_memory
         expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
         for: 2m