nixos-servers/services/monitoring/prometheus.nix
Torjus Håkestad f08ac69003
Some checks failed
Run nix flake check / flake-check (push) Failing after 5m5s
Improve monitoring stuff
2024-12-02 23:41:46 +01:00

121 lines
4.4 KiB
Nix

{ ... }:
{
services.prometheus = {
enable = true;
alertmanager =
  let
    # The route and the receiver definition must use the same name, so it is
    # bound once here.
    defaultReceiver = "webhook_gunter";
  in
  {
    # NOTE(review): this enables the Alertmanager service only. No
    # `services.prometheus.alertmanagers` entry is visible in this file;
    # confirm Prometheus is pointed at this Alertmanager elsewhere.
    enable = true;
    configuration = {
      # No global overrides.
      global = { };

      # Single top-level route: every alert goes to the default receiver,
      # grouped by alert name.
      route = {
        receiver = defaultReceiver;
        group_by = [ "alertname" ];
        group_wait = "30s";
        group_interval = "5m";
        repeat_interval = "12h";
      };

      # Alerts are delivered as webhooks to the endpoint on gunter.
      receivers = [
        {
          name = defaultReceiver;
          webhook_configs = [
            { url = "http://gunter.home.2rjus.net:5001/alert"; }
          ];
        }
      ];
    };
  };
# Prometheus alerting rules, passed as a single YAML document.
#
# Groups:
#   common_rules          - any scrape target reporting `up == 0`.
#   nameserver_rules      - unbound / nsd systemd units on instances matching "ns.+".
#   http-proxy_rules      - caddy systemd unit on http-proxy.
#   home_assistant_rules  - home-assistant, zigbee2mqtt and mosquitto units on ha1.
#
# Every rule must hold for 5 minutes before firing and carries
# `severity: critical`. The instance label values used in the expressions
# have to match the scrape target strings exactly (host:9100).
#
# Fix: the zigbee2mqtt alert was named `zigbee2qmtt_down` (transposed
# letters); it is now `zigbee2mqtt_down`, matching the unit it watches.
rules = [
  ''
    groups:
      - name: common_rules
        rules:
          - alert: node_down
            expr: up == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Instance {{ $labels.instance }} down"
              description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
      - name: nameserver_rules
        rules:
          - alert: unbound_down
            expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Unbound not running on {{ $labels.instance }}"
              description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
          - alert: nsd_down
            expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "NSD not running on {{ $labels.instance }}"
              description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
      - name: http-proxy_rules
        rules:
          - alert: caddy_down
            expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Caddy not running on {{ $labels.instance }}"
              description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
      - name: home_assistant_rules
        rules:
          - alert: home_assistant_down
            expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Home assistant not running on {{ $labels.instance }}"
              description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
          - alert: zigbee2mqtt_down
            expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
              description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
          - alert: mosquitto_down
            expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Mosquitto not running on {{ $labels.instance }}"
              description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
  ''
];
scrapeConfigs =
  let
    # Short names of the hosts scraped by the node-exporter job.
    nodeExporterHosts = [
      "gunter"
      "ca"
      "monitoring01"
      "ns1"
      "ns2"
      "http-proxy"
      "ha1"
    ];

    # Expand a short host name to the "<host>.home.2rjus.net:9100" target
    # string. The alert rules select on these exact instance values.
    toNodeExporterTarget = host: "${host}.home.2rjus.net:9100";
  in
  [
    {
      job_name = "node-exporter";
      static_configs = [
        { targets = map toNodeExporterTarget nodeExporterHosts; }
      ];
    }
  ];
};
}