diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix index 61c1834..7f7c6f4 100644 --- a/services/monitoring/prometheus.nix +++ b/services/monitoring/prometheus.nix @@ -48,7 +48,7 @@ for: 5m labels: severity: critical - annotations: + annotations: summary: "Instance {{ $labels.instance }} down" description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." - alert: low_disk_space @@ -56,7 +56,7 @@ for: 5m labels: severity: warning - annotations: + annotations: summary: "Disk space low on {{ $labels.instance }}" description: "Disk space is low on {{ $labels.instance }}. Please check." - alert: high_cpu_load @@ -64,7 +64,7 @@ for: 5m labels: severity: warning - annotations: + annotations: summary: "High CPU load on {{ $labels.instance }}" description: "CPU load is high on {{ $labels.instance }}. Please check." - name: nameserver_rules @@ -74,7 +74,7 @@ for: 5m labels: severity: critical - annotations: + annotations: summary: "Unbound not running on {{ $labels.instance }}" description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes." - alert: nsd_down @@ -82,7 +82,7 @@ for: 5m labels: severity: critical - annotations: + annotations: summary: "NSD not running on {{ $labels.instance }}" description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." - name: http-proxy_rules @@ -92,9 +92,19 @@ for: 5m labels: severity: critical - annotations: + annotations: summary: "Caddy not running on {{ $labels.instance }}" description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." + - name: nats_rules + rules: + - alert: nats_down + expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "NATS not running on {{ $labels.instance }}" + description: "NATS has been down on {{ $labels.instance }} more than 5 minutes." 
- name: home_assistant_rules rules: - alert: home_assistant_down @@ -102,7 +112,7 @@ for: 5m labels: severity: critical - annotations: + annotations: summary: "Home assistant not running on {{ $labels.instance }}" description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes." - alert: zigbee2qmtt_down @@ -110,7 +120,7 @@ for: 5m labels: severity: critical - annotations: + annotations: summary: "Zigbee2mqtt not running on {{ $labels.instance }}" description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes." - alert: mosquitto_down @@ -118,7 +128,7 @@ for: 5m labels: severity: critical - annotations: + annotations: summary: "Mosquitto not running on {{ $labels.instance }}" description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes." '' @@ -140,6 +150,7 @@ "ns1.home.2rjus.net:9100" "ns2.home.2rjus.net:9100" "pgdb1.home.2rjus.net:9100" + "nats1.home.2rjus.net:9100" ]; } ];