Add alerting rules
Some checks failed
Run nix flake check / flake-check (push) Failing after 8m51s

This commit is contained in:
Torjus Håkestad 2025-02-12 20:34:22 +01:00
parent a5448c5fc1
commit b8d058d23e
Signed by: torjus
SSH Key Fingerprint: SHA256:KjAds8wHfD2mBYK2H815s/+ABcSdcIHUndwHEdSxml4

View File

@ -67,14 +67,38 @@
annotations:
summary: "High CPU load on {{ $labels.instance }}"
description: "CPU load is high on {{ $labels.instance }}. Please check."
# Warn when available memory stays below 10% of total memory for 2 minutes.
- alert: low_memory
  # Ratio of MemAvailable to MemTotal, expressed as a percentage.
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Low available memory on {{ $labels.instance }}"
    # Double-quoted so that \n is parsed as a line break; in a plain
    # (unquoted) scalar the backslash and the "n" are kept literally.
    description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
# Warn as soon as the OOM-kill counter has grown within the last minute.
- alert: oom_kill
  # increase() over a 1m window; any value above zero matches.
  expr: increase(node_vmstat_oom_kill[1m]) > 0
  # No pending period: the alert fires on the first matching evaluation.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Host OOM kill detected on {{ $labels.instance }}"
    description: "OOM kill detected"
# Critical alert when nixos-upgrade.service reports the "failed" state.
- alert: nixos_upgrade_failed
  expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
  # A mapping may hold only one `for` key. The duplicate `for: 1m` is dropped
  # and the last-listed value is kept, so the alert fires without a pending
  # period.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "NixOS upgrade failed on {{ $labels.instance }}"
    description: "NixOS upgrade failed on {{ $labels.instance }}"
# Warn when promtail.service has not been in the "active" state for 5 minutes.
- alert: promtail_not_running
  # Single-quoted so the double quotes inside the label matchers stay literal.
  expr: 'node_systemd_unit_state{name="promtail.service", state="active"} == 0'
  # Must hold for 5 minutes, matching the wording of the description below.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Promtail service not running on {{ $labels.instance }}"
    description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
- name: nameserver_rules
rules:
- alert: unbound_down
@ -105,7 +129,7 @@
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
- name: nats_rules
rules:
- alert: nats down
- alert: nats_down
expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
for: 5m
labels:
@ -115,7 +139,7 @@
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
- name: nix_cache_rules
rules:
- alert: build-flakes service failed
- alert: build-flakes_service_failed
expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service",state="failed"} == 1
for: 5m
labels: