This commit is contained in:
parent
a5448c5fc1
commit
b8d058d23e
@@ -67,14 +67,38 @@
|
||||
annotations:
|
||||
summary: "High CPU load on {{ $labels.instance }}"
|
||||
description: "CPU load is high on {{ $labels.instance }}. Please check."
|
||||
# Fires when available memory has stayed below 10% of total memory for 2 minutes.
- alert: low_memory
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Low available memory on {{ $labels.instance }}"
    # Double-quoted so that \n is parsed as a real line break; in a plain
    # (unquoted) scalar it would be emitted as a literal backslash + "n".
    description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
|
||||
# Fires as soon as node_vmstat_oom_kill has increased within the last minute.
- alert: oom_kill
  expr: increase(node_vmstat_oom_kill[1m]) > 0
  # 0m: no hold period, the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Host OOM kill detected on {{ $labels.instance }}"
    description: "OOM kill detected"
|
||||
# Fires when the nixos-upgrade.service systemd unit is reported in the
# "failed" state.
- alert: nixos_upgrade_failed
  expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
  # Exactly one `for` key: duplicate mapping keys are invalid YAML. 0m is the
  # later of the two values that were present, so the alert fires with no
  # hold period.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "NixOS upgrade failed on {{ $labels.instance }}"
    description: "NixOS upgrade failed on {{ $labels.instance }}"
|
||||
# Fires when the "active" state series of promtail.service has been 0 for
# 5 minutes.
- alert: promtail_not_running
  expr: 'node_systemd_unit_state{name="promtail.service", state="active"} == 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Promtail service not running on {{ $labels.instance }}"
    # Folded scalar: the two lines below are joined with a single space.
    description: >-
      The promtail service has not been active on
      {{ $labels.instance }} for 5 minutes.
|
||||
- name: nameserver_rules
|
||||
rules:
|
||||
- alert: unbound_down
|
||||
@@ -105,7 +129,7 @@
|
||||
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
- name: nats_rules
|
||||
rules:
|
||||
- alert: nats down
|
||||
- alert: nats_down
|
||||
expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
@@ -115,7 +139,7 @@
|
||||
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
- name: nix_cache_rules
|
||||
rules:
|
||||
- alert: build-flakes service failed
|
||||
- alert: build-flakes_service_failed
|
||||
expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service",state="failed"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
|
Loading…
x
Reference in New Issue
Block a user