Move prometheus rules to external file

2025-05-18 14:54:09 +02:00
parent 071bf948a6
commit fe2e87658a
2 changed files with 187 additions and 188 deletions
--- a/services/monitoring/prometheus.nix
+++ b/services/monitoring/prometheus.nix
@@ -41,194 +41,7 @@
      scrape_interval = "15s";
    };
    rules = [
-      ''
-        groups:
-          - name: common_rules
-            rules:
-              - alert: node_down
-                expr: up == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "Instance {{ $labels.instance }} down"
-                  description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
-              - alert: low_disk_space
-                expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
-                for: 5m
-                labels:
-                  severity: warning
-                annotations:
-                  summary: "Disk space low on {{ $labels.instance }}"
-                  description: "Disk space is low on {{ $labels.instance }}. Please check."
-              - alert: high_cpu_load
-                expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7)
-                for: 15m
-                labels:
-                  severity: warning
-                annotations:
-                  summary: "High CPU load on {{ $labels.instance }}"
-                  description: "CPU load is high on {{ $labels.instance }}. Please check."
-              - alert: low_memory
-                expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
-                for: 2m
-                labels:
-                  severity: warning
-                annotations:
-                  summary: Low available memory on {{ $labels.instance }}
-                  description: Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}
-              - alert: oom_kill
-                expr: increase(node_vmstat_oom_kill[1m]) > 0
-                for: 0m
-                labels:
-                  severity: warning
-                annotations:
-                  summary: Host OOM kill detected on {{ $labels.instance }}
-                  description: OOM kill detected
-              - alert: nixos_upgrade_failed
-                expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
-                for: 0m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "NixOS upgrade failed on {{ $labels.instance }}"
-                  description: "NixOS upgrade failed on {{ $labels.instance }}"
-              - alert: promtail_not_running
-                expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
-                for: 5m
-                labels:
-                  severity: warning
-                annotations:
-                  summary: "Promtail service not running on {{ $labels.instance }}"
-                  description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
-          - name: nameserver_rules
-            rules:
-              - alert: unbound_down
-                expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "Unbound not running on {{ $labels.instance }}"
-                  description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
-              - alert: nsd_down
-                expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "NSD not running on {{ $labels.instance }}"
-                  description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
-          - name: http-proxy_rules 
-            rules:
-              - alert: caddy_down
-                expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "Caddy not running on {{ $labels.instance }}"
-                  description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
-          - name: nats_rules
-            rules:
-              - alert: nats_down
-                expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "NATS not running on {{ $labels.instance }}"
-                  description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
-          - name: nix_cache_rules
-            rules:
-              - alert: build-flakes_service_failed
-                expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1
-                for: 0m
-                keep_firing_for: 10m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "The build-flakes service on {{ $labels.instance }} has failed"
-                  description: "The build-flakes service on {{ $labels.instance }} has failed"
-              - alert: harmonia_down
-                expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "Harmonia not running on {{ $labels.instance }}"
-                  description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes."
-              - alert: low_disk_space_nix
-                expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
-                for: 5m
-                labels:
-                  severity: warning
-                annotations:
-                  summary: "Disk space low on /nix for {{ $labels.instance }}"
-                  description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
-          - name: home_assistant_rules
-            rules:
-              - alert: home_assistant_down
-                expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"}  == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "Home assistant not running on {{ $labels.instance }}"
-                  description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
-              - alert: zigbee2qmtt_down 
-                expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
-                  description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
-              - alert: mosquitto_down
-                expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
-                for: 5m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: "Mosquitto not running on {{ $labels.instance }}"
-                  description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
-          - name: smartctl_rules
-            rules:
-              - alert: SmartCriticalWarning
-                expr: smartctl_device_critical_warning > 0
-                for: 0m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: SMART critical warning (instance {{ $labels.instance }})
-                  description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-              - alert: SmartMediaErrors
-                expr: smartctl_device_media_errors > 0
-                for: 0m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: SMART media errors (instance {{ $labels.instance }})
-                  description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-              - alert: SmartWearoutIndicator
-                expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
-                for: 0m
-                labels:
-                  severity: critical
-                annotations:
-                  summary: SMART Wearout Indicator (instance {{ $labels.instance }})
-                  description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-          - name: wireguard_rules
-            rules:
-              - alert: WireguardHandshake
-                expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
-                for: 1m
-                labels:
-                  severity: warning
-                annotations:
-                  summary: "Wireguard handshake timeout on {{ $labels.instance }}"
-                  description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
-      ''
+      (builtins.readFile ./rules.yml)
    ];

    scrapeConfigs = [
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -0,0 +1,186 @@
+groups:
+  - name: common_rules  # rules whose selectors do not pin a specific instance
+    rules:
+      - alert: node_down
+        expr: up == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Instance {{ $labels.instance }} down'
+          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
+      - alert: low_disk_space
+        expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Disk space low on {{ $labels.instance }}'
+          description: 'Disk space is low on {{ $labels.instance }}. Please check.'
+      - alert: high_cpu_load
+        expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7)
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High CPU load on {{ $labels.instance }}'
+          description: 'CPU load is high on {{ $labels.instance }}. Please check.'
+      - alert: low_memory
+        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Low available memory on {{ $labels.instance }}'
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"  # double-quoted so \n is a newline
+      - alert: oom_kill
+        expr: increase(node_vmstat_oom_kill[1m]) > 0
+        for: 0m  # no pending period
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Host OOM kill detected on {{ $labels.instance }}'
+          description: 'OOM kill detected'
+      - alert: nixos_upgrade_failed
+        expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
+        for: 0m  # no pending period
+        labels:
+          severity: critical
+        annotations:
+          summary: 'NixOS upgrade failed on {{ $labels.instance }}'
+          description: 'NixOS upgrade failed on {{ $labels.instance }}'
+      - alert: promtail_not_running
+        expr: node_systemd_unit_state{name="promtail.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Promtail service not running on {{ $labels.instance }}'
+          description: 'The promtail service has not been active on {{ $labels.instance }} for 5 minutes.'
+  - name: nameserver_rules  # selectors match instances whose label is "ns" plus at least one more character
+    rules:
+      - alert: unbound_down
+        expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Unbound not running on {{ $labels.instance }}'
+          description: 'Unbound has been down on {{ $labels.instance }} more than 5 minutes.'
+      - alert: nsd_down
+        expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'NSD not running on {{ $labels.instance }}'
+          description: 'NSD has been down on {{ $labels.instance }} more than 5 minutes.'
+  - name: http-proxy_rules  # pinned to the http-proxy host's :9100 target
+    rules:
+      - alert: caddy_down
+        expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Caddy not running on {{ $labels.instance }}'
+          description: 'Caddy has been down on {{ $labels.instance }} more than 5 minutes.'
+  - name: nats_rules  # pinned to the nats1 host's :9100 target
+    rules:
+      - alert: nats_down
+        expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'NATS not running on {{ $labels.instance }}'
+          description: 'NATS has been down on {{ $labels.instance }} more than 5 minutes.'
+  - name: nix_cache_rules  # pinned to the nix-cache01 host's :9100 target
+    rules:
+      - alert: build-flakes_service_failed
+        expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="failed"} == 1
+        for: 0m  # no pending period
+        keep_firing_for: 10m  # stays firing for 10m after the expression stops matching
+        labels:
+          severity: critical
+        annotations:
+          summary: 'The build-flakes service on {{ $labels.instance }} has failed'
+          description: 'The build-flakes service on {{ $labels.instance }} has failed'
+      - alert: harmonia_down
+        expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Harmonia not running on {{ $labels.instance }}'
+          description: 'Harmonia has been down on {{ $labels.instance }} more than 5 minutes.'
+      - alert: low_disk_space_nix
+        expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Disk space low on /nix for {{ $labels.instance }}'
+          description: 'Disk space is low on /nix for host {{ $labels.instance }}. Please check.'
+  - name: home_assistant_rules  # pinned to the ha1 host's :9100 target
+    rules:
+      - alert: home_assistant_down
+        expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"}  == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Home assistant not running on {{ $labels.instance }}'
+          description: 'Home assistant has been down on {{ $labels.instance }} more than 5 minutes.'
+      - alert: zigbee2mqtt_down  # NOTE(review): spelling fixed from "zigbee2qmtt_down"; update any matcher using the old name
+        expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Zigbee2mqtt not running on {{ $labels.instance }}'
+          description: 'Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes.'
+      - alert: mosquitto_down
+        expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Mosquitto not running on {{ $labels.instance }}'
+          description: 'Mosquitto has been down on {{ $labels.instance }} more than 5 minutes.'
+  - name: smartctl_rules  # no instance selector: evaluated for every series of these smartctl_* metrics
+    rules:
+      - alert: SmartCriticalWarning
+        expr: smartctl_device_critical_warning > 0
+        for: 0m  # no pending period
+        labels:
+          severity: critical
+        annotations:
+          summary: 'SMART critical warning (instance {{ $labels.instance }})'
+          description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: SmartMediaErrors
+        expr: smartctl_device_media_errors > 0
+        for: 0m  # no pending period
+        labels:
+          severity: critical
+        annotations:
+          summary: 'SMART media errors (instance {{ $labels.instance }})'
+          description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: SmartWearoutIndicator
+        expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
+        for: 0m  # no pending period
+        labels:
+          severity: critical
+        annotations:
+          summary: 'SMART Wearout Indicator (instance {{ $labels.instance }})'
+          description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+  - name: wireguard_rules  # watches a single wg0 peer (by public key) on the http-proxy :9586 target
+    rules:
+      - alert: WireguardHandshake
+        expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
+        for: 1m  # the handshake-age condition above must hold for 1m before firing
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Wireguard handshake timeout on {{ $labels.instance }}'
+          description: 'Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minute.'