# NixOS module: Prometheus server with a local Alertmanager, alerting rules
# and static scrape targets.
{ ... }:
{
  services.prometheus = {
    enable = true;

    # Alertmanager instance managed by this module. Every alert is routed to a
    # single webhook receiver listening on localhost:5001.
    alertmanager = {
      enable = true;
      configuration = {
        global = { };
        route = {
          receiver = "webhook_natstonotify";
          group_wait = "30s";
          group_interval = "5m";
          repeat_interval = "1h";
          group_by = [ "alertname" ];
        };
        receivers = [
          {
            name = "webhook_natstonotify";
            webhook_configs = [
              { url = "http://localhost:5001/alert"; }
            ];
          }
        ];
      };
    };

    # Where Prometheus sends firing alerts (the Alertmanager enabled above).
    alertmanagers = [
      {
        static_configs = [
          { targets = [ "localhost:9093" ]; }
        ];
      }
    ];

    retentionTime = "30d";

    globalConfig = {
      scrape_interval = "15s";
    };

    # Alerting rules, written as one YAML document inside a Nix indented
    # string. `{{ ... }}` is Prometheus templating, not Nix interpolation.
    # Annotation values are double-quoted so that YAML escapes such as `\n`
    # are interpreted (Nix indented strings pass `\n` through unchanged).
    rules = [
      ''
        groups:
          - name: common_rules
            rules:
              - alert: node_down
                expr: up == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Instance {{ $labels.instance }} down"
                  description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
              - alert: low_disk_space
                expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Disk space low on {{ $labels.instance }}"
                  description: "Disk space is low on {{ $labels.instance }}. Please check."
              - alert: high_cpu_load
                expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7)
                for: 15m
                labels:
                  severity: warning
                annotations:
                  summary: "High CPU load on {{ $labels.instance }}"
                  description: "CPU load is high on {{ $labels.instance }}. Please check."
              - alert: low_memory
                expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "Low available memory on {{ $labels.instance }}"
                  description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
              - alert: oom_kill
                expr: increase(node_vmstat_oom_kill[1m]) > 0
                for: 0m
                labels:
                  severity: warning
                annotations:
                  summary: "Host OOM kill detected on {{ $labels.instance }}"
                  description: "OOM kill detected"
              - alert: nixos_upgrade_failed
                expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
                for: 0m
                labels:
                  severity: critical
                annotations:
                  summary: "NixOS upgrade failed on {{ $labels.instance }}"
                  description: "NixOS upgrade failed on {{ $labels.instance }}"
              - alert: promtail_not_running
                expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Promtail service not running on {{ $labels.instance }}"
                  description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
          - name: nameserver_rules
            rules:
              - alert: unbound_down
                expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Unbound not running on {{ $labels.instance }}"
                  description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
              - alert: nsd_down
                expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "NSD not running on {{ $labels.instance }}"
                  description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
          - name: http-proxy_rules
            rules:
              - alert: caddy_down
                expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Caddy not running on {{ $labels.instance }}"
                  description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
          - name: nats_rules
            rules:
              - alert: nats_down
                expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "NATS not running on {{ $labels.instance }}"
                  description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
          - name: nix_cache_rules
            rules:
              - alert: build-flakes_service_failed
                expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1
                for: 0m
                keep_firing_for: 10m
                labels:
                  severity: critical
                annotations:
                  summary: "The build-flakes service on {{ $labels.instance }} has failed"
                  description: "The build-flakes service on {{ $labels.instance }} has failed"
              - alert: harmonia_down
                expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Harmonia not running on {{ $labels.instance }}"
                  description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes."
              - alert: low_disk_space_nix
                expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Disk space low on /nix for {{ $labels.instance }}"
                  description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
          - name: home_assistant_rules
            rules:
              - alert: home_assistant_down
                expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Home assistant not running on {{ $labels.instance }}"
                  description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
              - alert: zigbee2mqtt_down
                expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
                  description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
              - alert: mosquitto_down
                expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Mosquitto not running on {{ $labels.instance }}"
                  description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
      ''
    ];

    scrapeConfigs = [
      # node_exporter on every host; the `instance` matchers in the rules
      # above refer to these "<host>:9100" targets.
      {
        job_name = "node-exporter";
        static_configs = [
          {
            targets = [
              "ca.home.2rjus.net:9100"
              "gunter.home.2rjus.net:9100"
              "ha1.home.2rjus.net:9100"
              "http-proxy.home.2rjus.net:9100"
              "jelly01.home.2rjus.net:9100"
              "monitoring01.home.2rjus.net:9100"
              "nix-cache01.home.2rjus.net:9100"
              "ns1.home.2rjus.net:9100"
              "ns2.home.2rjus.net:9100"
              "pgdb1.home.2rjus.net:9100"
              "nats1.home.2rjus.net:9100"
            ];
          }
        ];
      }
      {
        job_name = "prometheus";
        static_configs = [
          { targets = [ "localhost:9090" ]; }
        ];
      }
      {
        job_name = "loki";
        static_configs = [
          { targets = [ "localhost:3100" ]; }
        ];
      }
      {
        job_name = "grafana";
        static_configs = [
          # Previously "localhost:3100", which duplicated the loki target above.
          # 3000 is Grafana's default HTTP port.
          # NOTE(review): adjust if Grafana's http_port is overridden elsewhere.
          { targets = [ "localhost:3000" ]; }
        ];
      }
      {
        job_name = "alertmanager";
        static_configs = [
          { targets = [ "localhost:9093" ]; }
        ];
      }
      {
        job_name = "restic_rest";
        static_configs = [
          { targets = [ "10.69.12.52:8000" ]; }
        ];
      }
      # The listed address is passed to the exporter as the `target` query
      # parameter and kept as the `instance` label; the scrape itself goes to
      # the exporter at 127.0.0.1:9221.
      {
        job_name = "pve-exporter";
        static_configs = [
          { targets = [ "10.69.12.75" ]; }
        ];
        metrics_path = "/pve";
        params = {
          module = [ "default" ];
          cluster = [ "1" ];
          node = [ "1" ];
        };
        relabel_configs = [
          {
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            target_label = "__address__";
            replacement = "127.0.0.1:9221";
          }
        ];
      }
      {
        job_name = "caddy";
        static_configs = [
          # NOTE(review): no explicit port is given for this target; confirm
          # that Caddy serves its metrics on the default HTTP port.
          { targets = [ "http-proxy.home.2rjus.net" ]; }
        ];
      }
      {
        job_name = "jellyfin";
        static_configs = [
          { targets = [ "jelly01.home.2rjus.net:8096" ]; }
        ];
      }
      {
        job_name = "smartctl";
        static_configs = [
          { targets = [ "gunter.home.2rjus.net:9633" ]; }
        ];
      }
    ];
  };
}