Add nats host to monitoring
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled

This commit is contained in:
Torjus Håkestad 2025-02-11 23:12:55 +01:00
parent 539ff4eeac
commit f0bc29ac5e
Signed by: torjus
SSH Key Fingerprint: SHA256:KjAds8wHfD2mBYK2H815s/+ABcSdcIHUndwHEdSxml4

View File

@ -48,7 +48,7 @@
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Instance {{ $labels.instance }} down" summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: low_disk_space - alert: low_disk_space
@ -56,7 +56,7 @@
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Disk space low on {{ $labels.instance }}" summary: "Disk space low on {{ $labels.instance }}"
description: "Disk space is low on {{ $labels.instance }}. Please check." description: "Disk space is low on {{ $labels.instance }}. Please check."
- alert: high_cpu_load - alert: high_cpu_load
@ -64,7 +64,7 @@
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "High CPU load on {{ $labels.instance }}" summary: "High CPU load on {{ $labels.instance }}"
description: "CPU load is high on {{ $labels.instance }}. Please check." description: "CPU load is high on {{ $labels.instance }}. Please check."
- name: nameserver_rules - name: nameserver_rules
@ -74,7 +74,7 @@
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Unbound not running on {{ $labels.instance }}" summary: "Unbound not running on {{ $labels.instance }}"
description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes." description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
- alert: nsd_down - alert: nsd_down
@ -82,7 +82,7 @@
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "NSD not running on {{ $labels.instance }}" summary: "NSD not running on {{ $labels.instance }}"
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
- name: http-proxy_rules - name: http-proxy_rules
@ -92,9 +92,19 @@
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Caddy not running on {{ $labels.instance }}" summary: "Caddy not running on {{ $labels.instance }}"
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
- name: nats_rules
  rules:
    # Fires when the "active" state series of nats.service, as scraped from
    # nats1.home.2rjus.net:9100, has reported 0 for five minutes.
    # Named in snake_case like the other alerts in this file so it can be
    # matched on alertname without quoting.
    - alert: nats_down
      expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "NATS not running on {{ $labels.instance }}"
        description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
- name: home_assistant_rules - name: home_assistant_rules
rules: rules:
- alert: home_assistant_down - alert: home_assistant_down
@ -102,7 +112,7 @@
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Home assistant not running on {{ $labels.instance }}" summary: "Home assistant not running on {{ $labels.instance }}"
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes." description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
- alert: zigbee2qmtt_down - alert: zigbee2qmtt_down
@ -110,7 +120,7 @@
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Zigbee2mqtt not running on {{ $labels.instance }}" summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes." description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
- alert: mosquitto_down - alert: mosquitto_down
@ -118,7 +128,7 @@
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Mosquitto not running on {{ $labels.instance }}" summary: "Mosquitto not running on {{ $labels.instance }}"
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes." description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
'' ''
@ -140,6 +150,7 @@
"ns1.home.2rjus.net:9100" "ns1.home.2rjus.net:9100"
"ns2.home.2rjus.net:9100" "ns2.home.2rjus.net:9100"
"pgdb1.home.2rjus.net:9100" "pgdb1.home.2rjus.net:9100"
"nats1.home.2rjus.net:9100"
]; ];
} }
]; ];