Add some alerting rules
Some checks failed
Run nix flake check / flake-check (push) Failing after 14m34s

This commit is contained in:
Torjus Håkestad 2025-02-11 23:24:35 +01:00
parent f0bc29ac5e
commit f1ca20a387
Signed by: torjus
SSH Key Fingerprint: SHA256:KjAds8wHfD2mBYK2H815s/+ABcSdcIHUndwHEdSxml4

View File

@ -67,6 +67,14 @@
annotations:
summary: "High CPU load on {{ $labels.instance }}"
description: "CPU load is high on {{ $labels.instance }}. Please check."
- alert: nixos_upgrade_failed
expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
for: 1m
labels:
severity: critical
annotations:
summary: "NixOS upgrade failed on {{ $labels.instance }}"
description: "NixOS upgrade failed on {{ $labels.instance }}"
- name: nameserver_rules
rules:
- alert: unbound_down
@ -105,6 +113,24 @@
annotations:
summary: "NATS not running on {{ $labels.instance }}"
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
- name: nix_cache_rules
rules:
- alert: build-flakes service failed
expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service",state="failed"} == 1
for: 5m
labels:
severity: critical
annotations:
summary: "The build-flakes service on {{ $labels.instance }} has failed"
description: "The build-flakes service on {{ $labels.instance }} has failed"
- alert: low_disk_space_nix
expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Disk space low on /nix for {{ $labels.instance }}"
description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
- name: home_assistant_rules
rules:
- alert: home_assistant_down