# Patch summary (services/monitoring/rules.yml, one hunk at old line 18):
#
#   * The existing `high_cpu_load` rule keeps its threshold and `for: 15m`,
#     but both selectors in its expression now exclude the instance
#     "nix-cache01.home.2rjus.net:9100" via `instance!=...`.
#   * A second rule, also named `high_cpu_load`, is added directly after it.
#     Its expression is the same comparison restricted to that one instance
#     (`instance="..."`), and it must hold for 2h instead of 15m before
#     firing. Severity and annotations are copied unchanged.
#
# The condition in both rules: per instance, node_load5 is greater than
# 0.7 times the number of node_cpu_seconds_total{mode="idle"} series for
# that instance.
#
# The hunk header declares 13 old lines and 21 new lines, i.e. one line
# replaced and eight lines added; the surrounding disk-space and
# `low_memory` rules appear only as unchanged context.
#
# NOTE(review): the longer 2h window is presumably because this host is
# expected to run at high load for extended periods -- confirm with the
# author; the patch itself does not state the reason.
# NOTE(review): the patch text below appears to have had its newlines
# collapsed into spaces. The original indentation of the YAML lines cannot
# be recovered from this form, so regenerate the patch from the repository
# rather than reflowing it by hand before applying.
diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index b9b36aa..ea5dc42 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -18,13 +18,21 @@ groups: summary: "Disk space low on {{ $labels.instance }}" description: "Disk space is low on {{ $labels.instance }}. Please check." - alert: high_cpu_load - expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7) + expr: max(node_load5{instance!="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance!="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7) for: 15m labels: severity: warning annotations: summary: "High CPU load on {{ $labels.instance }}" description: "CPU load is high on {{ $labels.instance }}. Please check." + - alert: high_cpu_load + expr: max(node_load5{instance="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7) + for: 2h + labels: + severity: warning + annotations: + summary: "High CPU load on {{ $labels.instance }}" + description: "CPU load is high on {{ $labels.instance }}. Please check." - alert: low_memory expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 for: 2m