monitoring: increase high_cpu_load duration for nix-cache01 to 2h

nix-cache01 regularly hits high CPU during nix builds, causing flappy
alerts. Keep the 15m threshold for all other hosts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-05 02:28:48 +01:00
parent 64f2688349
commit 28b8d7c115

View File

@@ -18,13 +18,21 @@ groups:
summary: "Disk space low on {{ $labels.instance }}" summary: "Disk space low on {{ $labels.instance }}"
description: "Disk space is low on {{ $labels.instance }}. Please check." description: "Disk space is low on {{ $labels.instance }}. Please check."
- alert: high_cpu_load - alert: high_cpu_load
expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7) expr: max(node_load5{instance!="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance!="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "High CPU load on {{ $labels.instance }}" summary: "High CPU load on {{ $labels.instance }}"
description: "CPU load is high on {{ $labels.instance }}. Please check." description: "CPU load is high on {{ $labels.instance }}. Please check."
- alert: high_cpu_load
expr: max(node_load5{instance="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
for: 2h
labels:
severity: warning
annotations:
summary: "High CPU load on {{ $labels.instance }}"
description: "CPU load is high on {{ $labels.instance }}. Please check."
- alert: low_memory - alert: low_memory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m for: 2m