{ ... }:
{
  services.prometheus = {
    enable = true;
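    # Alertmanager handles routing of fired alerts; everything goes to a single
    # local webhook receiver ("webhook_natstonotify") listening on port 5001.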
    alertmanager = {
      enable = true;
      configuration = {
        global = { };
        route = {
          receiver = "webhook_natstonotify";
          group_wait = "30s";
          group_interval = "5m";
          repeat_interval = "1h";
          group_by = [ "alertname" ];
        };
        receivers = [
          {
            name = "webhook_natstonotify";
            webhook_configs = [
              {
                url = "http://localhost:5001/alert";
              }
            ];
          }
        ];
      };
    };
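    # Point Prometheus at the Alertmanager instance configured above.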
    alertmanagers = [
      {
        static_configs = [
          {
            targets = [ "localhost:9093" ];
          }
        ];
      }
    ];
    retentionTime = "30d";
    globalConfig = {
      scrape_interval = "15s";
    };
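    # Alerting rules, embedded as a Prometheus rule file (YAML). Most of the
    # service checks rely on node_exporter's systemd unit-state metrics.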
    rules = [
      ''
        groups:
          - name: common_rules
            rules:
              - alert: node_down
                expr: up == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Instance {{ $labels.instance }} down"
                  description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
              - alert: low_disk_space
                expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Disk space low on {{ $labels.instance }}"
                  description: "Disk space is low on {{ $labels.instance }}. Please check."
              - alert: high_cpu_load
                expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7)
                for: 15m
                labels:
                  severity: warning
                annotations:
                  summary: "High CPU load on {{ $labels.instance }}"
                  description: "CPU load is high on {{ $labels.instance }}. Please check."
              - alert: low_memory
                expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "Low available memory on {{ $labels.instance }}"
                  description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
              - alert: oom_kill
                expr: increase(node_vmstat_oom_kill[1m]) > 0
                for: 0m
                labels:
                  severity: warning
                annotations:
                  summary: "Host OOM kill detected on {{ $labels.instance }}"
                  description: "OOM kill detected"
              - alert: nixos_upgrade_failed
                expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
                for: 0m
                labels:
                  severity: critical
                annotations:
                  summary: "NixOS upgrade failed on {{ $labels.instance }}"
                  description: "NixOS upgrade failed on {{ $labels.instance }}"
              - alert: promtail_not_running
                expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Promtail service not running on {{ $labels.instance }}"
                  description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
          - name: nameserver_rules
            rules:
              - alert: unbound_down
                expr: node_systemd_unit_state{instance=~"ns.+", name="unbound.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Unbound not running on {{ $labels.instance }}"
                  description: "Unbound has been down on {{ $labels.instance }} for more than 5 minutes."
              - alert: nsd_down
                expr: node_systemd_unit_state{instance=~"ns.+", name="nsd.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "NSD not running on {{ $labels.instance }}"
                  description: "NSD has been down on {{ $labels.instance }} for more than 5 minutes."
          - name: http-proxy_rules
            rules:
              - alert: caddy_down
                expr: node_systemd_unit_state{instance="http-proxy.home.2rjus.net:9100", name="caddy.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Caddy not running on {{ $labels.instance }}"
                  description: "Caddy has been down on {{ $labels.instance }} for more than 5 minutes."
          - name: nats_rules
            rules:
              - alert: nats_down
                expr: node_systemd_unit_state{instance="nats1.home.2rjus.net:9100", name="nats.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "NATS not running on {{ $labels.instance }}"
                  description: "NATS has been down on {{ $labels.instance }} for more than 5 minutes."
          - name: nix_cache_rules
            rules:
              - alert: build-flakes_service_failed
                expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1
                for: 0m
                keep_firing_for: 10m
                labels:
                  severity: critical
                annotations:
                  summary: "The build-flakes service on {{ $labels.instance }} has failed"
                  description: "The build-flakes service on {{ $labels.instance }} has failed"
              - alert: harmonia_down
                expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="harmonia.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Harmonia not running on {{ $labels.instance }}"
                  description: "Harmonia has been down on {{ $labels.instance }} for more than 5 minutes."
              - alert: low_disk_space_nix
                expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Disk space low on /nix for {{ $labels.instance }}"
                  description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
          - name: home_assistant_rules
            rules:
              - alert: home_assistant_down
                expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Home Assistant not running on {{ $labels.instance }}"
                  description: "Home Assistant has been down on {{ $labels.instance }} for more than 5 minutes."
              - alert: zigbee2mqtt_down
                expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="zigbee2mqtt.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
                  description: "Zigbee2mqtt has been down on {{ $labels.instance }} for more than 5 minutes."
              - alert: mosquitto_down
                expr: node_systemd_unit_state{instance="ha1.home.2rjus.net:9100", name="mosquitto.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Mosquitto not running on {{ $labels.instance }}"
                  description: "Mosquitto has been down on {{ $labels.instance }} for more than 5 minutes."
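          # SMART disk health alerts, fed by the smartctl exporter scrape job below.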
          - name: smartctl_rules
            rules:
              - alert: SmartCriticalWarning
                expr: smartctl_device_critical_warning > 0
                for: 0m
                labels:
                  severity: critical
                annotations:
                  summary: "SMART critical warning (instance {{ $labels.instance }})"
                  description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
              - alert: SmartMediaErrors
                expr: smartctl_device_media_errors > 0
                for: 0m
                labels:
                  severity: critical
                annotations:
                  summary: "SMART media errors (instance {{ $labels.instance }})"
                  description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
              - alert: SmartWearoutIndicator
                expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
                for: 0m
                labels:
                  severity: critical
                annotations:
                  summary: "SMART Wearout Indicator (instance {{ $labels.instance }})"
                  description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
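          # Fire when the wg0 peer on http-proxy has not completed a WireGuard
          # handshake in the last 5 minutes.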
          - name: wireguard_rules
            rules:
              - alert: WireguardHandshake
                expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
                for: 1m
                labels:
                  severity: warning
                annotations:
                  summary: "Wireguard handshake timeout on {{ $labels.instance }}"
                  description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minute."
      ''
    ];
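    # Scrape targets: node_exporter on each host plus a handful of
    # service-specific exporters.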
    scrapeConfigs = [
      {
        job_name = "node-exporter";
        static_configs = [
          {
            targets = [
              "ca.home.2rjus.net:9100"
              "gunter.home.2rjus.net:9100"
              "ha1.home.2rjus.net:9100"
              "http-proxy.home.2rjus.net:9100"
              "jelly01.home.2rjus.net:9100"
              "monitoring01.home.2rjus.net:9100"
              "nix-cache01.home.2rjus.net:9100"
              "ns1.home.2rjus.net:9100"
              "ns2.home.2rjus.net:9100"
              "pgdb1.home.2rjus.net:9100"
              "nats1.home.2rjus.net:9100"
            ];
          }
        ];
      }
      {
        job_name = "prometheus";
        static_configs = [
          {
            targets = [ "localhost:9090" ];
          }
        ];
      }
      {
        job_name = "loki";
        static_configs = [
          {
            targets = [ "localhost:3100" ];
          }
        ];
      }
      {
        job_name = "grafana";
        static_configs = [
          {
            targets = [ "localhost:3000" ];
          }
        ];
      }
      {
        job_name = "alertmanager";
        static_configs = [
          {
            targets = [ "localhost:9093" ];
          }
        ];
      }
      {
        job_name = "restic_rest";
        static_configs = [
          {
            targets = [ "10.69.12.52:8000" ];
          }
        ];
      }
      {
        job_name = "pve-exporter";
        static_configs = [
          {
            targets = [ "10.69.12.75" ];
          }
        ];
        metrics_path = "/pve";
        params = {
          module = [ "default" ];
          cluster = [ "1" ];
          node = [ "1" ];
        };
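        # Multi-target exporter pattern: the listed address becomes the
        # ?target= query parameter and the instance label, while the scrape
        # itself goes to the local pve-exporter on 127.0.0.1:9221.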
        relabel_configs = [
          {
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            target_label = "__address__";
            replacement = "127.0.0.1:9221";
          }
        ];
      }
      {
        job_name = "caddy";
        static_configs = [
          {
            targets = [ "http-proxy.home.2rjus.net" ];
          }
        ];
      }
      {
        job_name = "jellyfin";
        static_configs = [
          {
            targets = [ "jelly01.home.2rjus.net:8096" ];
          }
        ];
      }
      {
        job_name = "smartctl";
        static_configs = [
          {
            targets = [ "gunter.home.2rjus.net:9633" ];
          }
        ];
      }
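      # WireGuard exporter on http-proxy (port 9586); source of the
      # wireguard_latest_handshake_seconds metric used by the
      # WireguardHandshake alert above.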
      {
        job_name = "wireguard";
        static_configs = [
          {
            targets = [ "http-proxy.home.2rjus.net:9586" ];
          }
        ];
      }
    ];
  };
}