Improve monitoring: enable Alertmanager with a webhook receiver and add Prometheus alerting rules
Some checks failed
Run nix flake check / flake-check (push) Failing after 5m5s
Some checks failed
Run nix flake check / flake-check (push) Failing after 5m5s
This commit is contained in:
parent
6caa78b824
commit
f08ac69003
@ -3,8 +3,100 @@
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
alertmanager = {
|
||||
enable = false;
|
||||
enable = true;
|
||||
configuration = {
|
||||
global =
|
||||
{
|
||||
};
|
||||
route = {
|
||||
receiver = "webhook_gunter";
|
||||
group_wait = "30s";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "12h";
|
||||
group_by = [ "alertname" ];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "webhook_gunter";
|
||||
webhook_configs = [
|
||||
{
|
||||
url = "http://gunter.home.2rjus.net:5001/alert";
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
rules = [
|
||||
''
|
||||
groups:
|
||||
- name: common_rules
|
||||
rules:
|
||||
- alert: node_down
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
||||
- name: nameserver_rules
|
||||
rules:
|
||||
- alert: unbound_down
|
||||
expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Unbound not running on {{ $labels.instance }}"
|
||||
description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
- alert: nsd_down
|
||||
expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "NSD not running on {{ $labels.instance }}"
|
||||
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
- name: http-proxy_rules
|
||||
rules:
|
||||
- alert: caddy_down
|
||||
expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Caddy not running on {{ $labels.instance }}"
|
||||
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
- name: home_assistant_rules
|
||||
rules:
|
||||
- alert: home_assistant_down
|
||||
expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Home assistant not running on {{ $labels.instance }}"
|
||||
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
# Fires when the zigbee2mqtt.service unit on ha1 has reported state "active" == 0
# continuously for 5 minutes.
# NOTE(review): the alert name was previously misspelled "zigbee2qmtt_down"; confirm
# nothing outside this file (silences, dashboards, webhook consumers) matches on the
# old name.
- alert: zigbee2mqtt_down
  expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
    description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
- alert: mosquitto_down
|
||||
expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Mosquitto not running on {{ $labels.instance }}"
|
||||
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
''
|
||||
];
|
||||
|
||||
scrapeConfigs = [
|
||||
{
|
||||
job_name = "node-exporter";
|
||||
|
Loading…
Reference in New Issue
Block a user