Add nats host to monitoring
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
This commit is contained in:
parent
539ff4eeac
commit
f0bc29ac5e
@ -48,7 +48,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Instance {{ $labels.instance }} down"
|
summary: "Instance {{ $labels.instance }} down"
|
||||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||||
- alert: low_disk_space
|
- alert: low_disk_space
|
||||||
@ -56,7 +56,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Disk space low on {{ $labels.instance }}"
|
summary: "Disk space low on {{ $labels.instance }}"
|
||||||
description: "Disk space is low on {{ $labels.instance }}. Please check."
|
description: "Disk space is low on {{ $labels.instance }}. Please check."
|
||||||
- alert: high_cpu_load
|
- alert: high_cpu_load
|
||||||
@ -64,7 +64,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "High CPU load on {{ $labels.instance }}"
|
summary: "High CPU load on {{ $labels.instance }}"
|
||||||
description: "CPU load is high on {{ $labels.instance }}. Please check."
|
description: "CPU load is high on {{ $labels.instance }}. Please check."
|
||||||
- name: nameserver_rules
|
- name: nameserver_rules
|
||||||
@ -74,7 +74,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Unbound not running on {{ $labels.instance }}"
|
summary: "Unbound not running on {{ $labels.instance }}"
|
||||||
description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
- alert: nsd_down
|
- alert: nsd_down
|
||||||
@ -82,7 +82,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "NSD not running on {{ $labels.instance }}"
|
summary: "NSD not running on {{ $labels.instance }}"
|
||||||
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
- name: http-proxy_rules
|
- name: http-proxy_rules
|
||||||
@ -92,9 +92,19 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Caddy not running on {{ $labels.instance }}"
|
summary: "Caddy not running on {{ $labels.instance }}"
|
||||||
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
|
- name: nats_rules
|
||||||
|
rules:
|
||||||
|
- alert: nats_down
|
||||||
|
expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "NATS not running on {{ $labels.instance }}"
|
||||||
|
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
- name: home_assistant_rules
|
- name: home_assistant_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: home_assistant_down
|
- alert: home_assistant_down
|
||||||
@ -102,7 +112,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Home assistant not running on {{ $labels.instance }}"
|
summary: "Home assistant not running on {{ $labels.instance }}"
|
||||||
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
- alert: zigbee2mqtt_down
|
- alert: zigbee2mqtt_down
|
||||||
@ -110,7 +120,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
|
summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
|
||||||
description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
- alert: mosquitto_down
|
- alert: mosquitto_down
|
||||||
@ -118,7 +128,7 @@
|
|||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Mosquitto not running on {{ $labels.instance }}"
|
summary: "Mosquitto not running on {{ $labels.instance }}"
|
||||||
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
''
|
''
|
||||||
@ -140,6 +150,7 @@
|
|||||||
"ns1.home.2rjus.net:9100"
|
"ns1.home.2rjus.net:9100"
|
||||||
"ns2.home.2rjus.net:9100"
|
"ns2.home.2rjus.net:9100"
|
||||||
"pgdb1.home.2rjus.net:9100"
|
"pgdb1.home.2rjus.net:9100"
|
||||||
|
"nats1.home.2rjus.net:9100"
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user