monitoring: auto-generate Prometheus scrape targets from host configs
Add homelab.monitoring NixOS options (enable, scrapeTargets) following the
same pattern as homelab.dns. Prometheus scrape configs are now auto-generated
from flake host configurations plus a list of external targets, replacing the
hardcoded target lists.

Also cleans up the alert rules:

- snake_case alert naming
- fix zigbee2mqtt typo
- remove duplicate pushgateway alert
- add for clauses to monitoring_rules
- remove hardcoded WireGuard public key from the handshake alert
- new alerts for certificates, Proxmox, Caddy, smartctl temperature,
  filesystem fill prediction, systemd state, file descriptors, and host reboots

Fixes the grafana scrape target port from 3100 to 3000.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
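The options module itself is not part of this diff. As a minimal sketch of what the homelab.monitoring interface might look like, assuming it mirrors the homelab.dns pattern — the module path and defaults here are assumptions, while the field names (job_name, port, metrics_path, scrape_interval, scheme) match the per-host declarations below:

# Hypothetical sketch of the homelab.monitoring options (not the actual module).
{ lib, ... }:
{
  options.homelab.monitoring = {
    enable = lib.mkEnableOption "scraping of this host by the monitoring server";
    scrapeTargets = lib.mkOption {
      description = "Scrape jobs exposed by this host, collected on the monitoring host.";
      default = [ ];
      type = lib.types.listOf (lib.types.submodule {
        options = {
          job_name = lib.mkOption { type = lib.types.str; };
          port = lib.mkOption { type = lib.types.port; };
          metrics_path = lib.mkOption { type = lib.types.str; default = "/metrics"; };
          scrape_interval = lib.mkOption { type = lib.types.nullOr lib.types.str; default = null; };
          scheme = lib.mkOption { type = lib.types.str; default = "http"; };
        };
      });
    };
  };
}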
@@ -1,5 +1,9 @@
 { pkgs, unstable, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "step-ca";
+    port = 9000;
+  }];
   sops.secrets."ca_root_pw" = {
     sopsFile = ../../secrets/ca/secrets.yaml;
     owner = "step-ca";

@@ -1,5 +1,11 @@
 { pkgs, config, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "home-assistant";
+    port = 8123;
+    metrics_path = "/api/prometheus";
+    scrape_interval = "60s";
+  }];
   # Enable the Home Assistant service
   services.home-assistant = {
     enable = true;

@@ -3,4 +3,9 @@
   imports = [
     ./proxy.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "caddy";
+    port = 80;
+  }];
 }

@@ -1,5 +1,9 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "jellyfin";
+    port = 8096;
+  }];
   services.jellyfin = {
     enable = true;
   };

services/monitoring/external-targets.nix (new file)
@@ -0,0 +1,12 @@
+# Monitoring targets for hosts not managed by this flake
+# These are manually maintained and combined with auto-generated targets
+{
+  nodeExporter = [
+    "gunter.home.2rjus.net:9100"
+  ];
+  scrapeConfigs = [
+    { job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
+    { job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
+    { job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
+  ];
+}

@@ -1,4 +1,11 @@
-{ ... }:
+{ self, lib, ... }:
+let
+  monLib = import ../../lib/monitoring.nix { inherit lib; };
+  externalTargets = import ./external-targets.nix;
+
+  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
+  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
+in
 {
   services.prometheus = {
     enable = true;
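lib/monitoring.nix is imported above but not included in this commit view. A plausible sketch, assuming every flake host exposes config.homelab.monitoring and is reachable as <name>.home.2rjus.net — the two function names match the call sites, while the fqdn helper and attribute layout are assumptions:

# Hypothetical sketch of lib/monitoring.nix (the real file is not shown in this diff).
{ lib }:
let
  fqdn = name: "${name}.home.2rjus.net"; # assumed naming convention
  # All hosts in the flake with monitoring enabled.
  enabledHosts = self:
    lib.filterAttrs (_: host: host.config.homelab.monitoring.enable or false)
      self.nixosConfigurations;
in
{
  # Every enabled flake host on :9100, plus the manually maintained externals.
  generateNodeExporterTargets = self: external:
    lib.mapAttrsToList (name: _: "${fqdn name}:9100") (enabledHosts self)
    ++ external.nodeExporter;

  # One scrape config per declared target, plus the external jobs.
  generateScrapeConfigs = self: external:
    lib.concatLists (lib.mapAttrsToList
      (name: host:
        map
          (t:
            {
              inherit (t) job_name;
              static_configs = [{ targets = [ "${fqdn name}:${toString t.port}" ]; }];
            }
            // lib.optionalAttrs (t ? metrics_path) { inherit (t) metrics_path; }
            // lib.optionalAttrs (t ? scrape_interval) { inherit (t) scrape_interval; }
            // lib.optionalAttrs (t ? scheme) { inherit (t) scheme; })
          host.config.homelab.monitoring.scrapeTargets)
      (enabledHosts self))
    ++ map (j: { inherit (j) job_name; static_configs = [{ inherit (j) targets; }]; })
      external.scrapeConfigs;
}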
@@ -45,26 +52,16 @@
     ];

     scrapeConfigs = [
+      # Auto-generated node-exporter targets from flake hosts + external
       {
         job_name = "node-exporter";
         static_configs = [
           {
-            targets = [
-              "ca.home.2rjus.net:9100"
-              "gunter.home.2rjus.net:9100"
-              "ha1.home.2rjus.net:9100"
-              "http-proxy.home.2rjus.net:9100"
-              "jelly01.home.2rjus.net:9100"
-              "monitoring01.home.2rjus.net:9100"
-              "nix-cache01.home.2rjus.net:9100"
-              "ns1.home.2rjus.net:9100"
-              "ns2.home.2rjus.net:9100"
-              "pgdb1.home.2rjus.net:9100"
-              "nats1.home.2rjus.net:9100"
-            ];
+            targets = nodeExporterTargets;
           }
         ];
       }
+      # Local monitoring services (not auto-generated)
       {
         job_name = "prometheus";
         static_configs = [
@@ -85,7 +82,7 @@
         job_name = "grafana";
         static_configs = [
           {
-            targets = [ "localhost:3100" ];
+            targets = [ "localhost:3000" ];
           }
         ];
       }
@@ -98,13 +95,23 @@
           }
         ];
       }
       {
-        job_name = "restic_rest";
+        job_name = "pushgateway";
+        honor_labels = true;
         static_configs = [
           {
-            targets = [ "10.69.12.52:8000" ];
+            targets = [ "localhost:9091" ];
           }
         ];
       }
+      {
+        job_name = "labmon";
+        static_configs = [
+          {
+            targets = [ "monitoring01.home.2rjus.net:9969" ];
+          }
+        ];
+      }
+      # pve-exporter with complex relabel config
       {
         job_name = "pve-exporter";
         static_configs = [
@@ -133,91 +140,8 @@
           }
         ];
       }
-      {
-        job_name = "caddy";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net" ];
-          }
-        ];
-      }
-      {
-        job_name = "jellyfin";
-        static_configs = [
-          {
-            targets = [ "jelly01.home.2rjus.net:8096" ];
-          }
-        ];
-      }
-      {
-        job_name = "smartctl";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:9633" ];
-          }
-        ];
-      }
-      {
-        job_name = "wireguard";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net:9586" ];
-          }
-        ];
-      }
-      {
-        job_name = "home-assistant";
-        scrape_interval = "60s";
-        metrics_path = "/api/prometheus";
-        static_configs = [
-          {
-            targets = [ "ha1.home.2rjus.net:8123" ];
-          }
-        ];
-      }
-      {
-        job_name = "ghettoptt";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:8989" ];
-          }
-        ];
-      }
-      {
-        job_name = "step-ca";
-        static_configs = [
-          {
-            targets = [ "ca.home.2rjus.net:9000" ];
-          }
-        ];
-      }
-      {
-        job_name = "labmon";
-        static_configs = [
-          {
-            targets = [ "monitoring01.home.2rjus.net:9969" ];
-          }
-        ];
-      }
-      {
-        job_name = "pushgateway";
-        honor_labels = true;
-        static_configs = [
-          {
-            targets = [ "localhost:9091" ];
-          }
-        ];
-      }
-      {
-        job_name = "nix-cache_caddy";
-        scheme = "https";
-        static_configs = [
-          {
-            targets = [ "nix-cache.home.2rjus.net" ];
-          }
-        ];
-      }
-    ];
+    ] ++ autoScrapeConfigs;
+
     pushgateway = {
       enable = true;
       web = {

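Concretely, the per-host declaration on the ca host in the first hunk above should presumably render to the same job the hardcoded list used to carry:

# Declared on the host (from the first hunk of this diff):
homelab.monitoring.scrapeTargets = [{
  job_name = "step-ca";
  port = 9000;
}];

# Presumed generated scrape config, matching the removed hardcoded entry:
{
  job_name = "step-ca";
  static_configs = [{ targets = [ "ca.home.2rjus.net:9000" ]; }];
}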
@@ -57,6 +57,38 @@ groups:
         annotations:
           summary: "Promtail service not running on {{ $labels.instance }}"
           description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
+      - alert: filesystem_filling_up
+        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
+          description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
+      - alert: systemd_not_running
+        expr: node_systemd_system_running == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Systemd not in running state on {{ $labels.instance }}"
+          description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
+      - alert: high_file_descriptors
+        expr: node_filefd_allocated / node_filefd_maximum > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High file descriptor usage on {{ $labels.instance }}"
+          description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
+      - alert: host_reboot
+        expr: changes(node_boot_time_seconds[10m]) > 0
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: "Host {{ $labels.instance }} has rebooted"
+          description: "Host {{ $labels.instance }} has rebooted."
   - name: nameserver_rules
     rules:
       - alert: unbound_down
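For intuition on the new filesystem_filling_up rule: predict_linear() fits a least-squares line to node_filesystem_free_bytes over the 6h lookback and extrapolates it 24h ahead, so the alert condition is roughly

\[
\widehat{\mathrm{free}}(t_0 + \Delta) = \hat b + \hat m\,(t_0 + \Delta) < 0,
\qquad \Delta = 24 \cdot 3600\ \mathrm{s},
\]

where \(\hat m\) and \(\hat b\) are the slope and intercept of the regression over the window. The for: 1h clause then requires the prediction to hold continuously for an hour, damping alerts from short-lived write bursts.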
@@ -75,7 +107,7 @@ groups:
         annotations:
           summary: "NSD not running on {{ $labels.instance }}"
           description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
-  - name: http-proxy_rules
+  - name: http_proxy_rules
     rules:
       - alert: caddy_down
         expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@ groups:
         annotations:
           summary: "Caddy not running on {{ $labels.instance }}"
           description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: caddy_upstream_unhealthy
+        expr: caddy_reverse_proxy_upstreams_healthy == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
+          description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
+      - alert: caddy_high_error_rate
+        expr: rate(caddy_http_request_errors_total[5m]) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High HTTP error rate on {{ $labels.instance }}"
+          description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
   - name: nats_rules
     rules:
       - alert: nats_down
@@ -97,7 +145,7 @@ groups:
           description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
   - name: nix_cache_rules
     rules:
-      - alert: build-flakes_service_not_active_recently
+      - alert: build_flakes_service_not_active_recently
         expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
         for: 0m
         labels:
@@ -138,7 +186,7 @@ groups:
         annotations:
           summary: "Home assistant not running on {{ $labels.instance }}"
           description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
-      - alert: zigbee2qmtt_down
+      - alert: zigbee2mqtt_down
         expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
         for: 5m
         labels:
@@ -156,7 +204,7 @@ groups:
           description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
   - name: smartctl_rules
     rules:
-      - alert: SmartCriticalWarning
+      - alert: smart_critical_warning
         expr: smartctl_device_critical_warning > 0
         for: 0m
         labels:
@@ -164,7 +212,7 @@ groups:
         annotations:
           summary: SMART critical warning (instance {{ $labels.instance }})
           description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: SmartMediaErrors
+      - alert: smart_media_errors
         expr: smartctl_device_media_errors > 0
         for: 0m
         labels:
@@ -172,7 +220,7 @@ groups:
         annotations:
           summary: SMART media errors (instance {{ $labels.instance }})
           description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - alert: SmartWearoutIndicator
+      - alert: smart_wearout_indicator
         expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
         for: 0m
         labels:
@@ -180,20 +228,29 @@ groups:
         annotations:
           summary: SMART Wearout Indicator (instance {{ $labels.instance }})
           description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: smartctl_high_temperature
+        expr: smartctl_device_temperature > 60
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk temperature above 60C on {{ $labels.instance }}"
+          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
   - name: wireguard_rules
     rules:
-      - alert: WireguardHandshake
-        expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
+      - alert: wireguard_handshake_timeout
+        expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
         for: 1m
         labels:
           severity: warning
         annotations:
           summary: "Wireguard handshake timeout on {{ $labels.instance }}"
-          description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
+          description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
   - name: monitoring_rules
     rules:
       - alert: prometheus_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -201,6 +258,7 @@ groups:
           description: "Prometheus service not running on {{ $labels.instance }}"
       - alert: alertmanager_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -208,13 +266,7 @@ groups:
           description: "Alertmanager service not running on {{ $labels.instance }}"
-      - alert: pushgateway_not_running
-        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
-        labels:
-          severity: critical
-        annotations:
-          summary: "Pushgateway service not running on {{ $labels.instance }}"
-          description: "Pushgateway service not running on {{ $labels.instance }}"
       - alert: pushgateway_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -222,6 +274,7 @@ groups:
           description: "Pushgateway service not running on {{ $labels.instance }}"
       - alert: loki_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -229,6 +282,7 @@ groups:
           description: "Loki service not running on {{ $labels.instance }}"
       - alert: grafana_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
@@ -236,6 +290,7 @@ groups:
           description: "Grafana service not running on {{ $labels.instance }}"
       - alert: tempo_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
@@ -243,8 +298,53 @@ groups:
           description: "Tempo service not running on {{ $labels.instance }}"
       - alert: pyroscope_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
           summary: "Pyroscope service not running on {{ $labels.instance }}"
           description: "Pyroscope service not running on {{ $labels.instance }}"
+  - name: certificate_rules
+    rules:
+      - alert: certificate_expiring_soon
+        expr: labmon_tlsconmon_certificate_seconds_left < 86400
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "TLS certificate expiring soon for {{ $labels.instance }}"
+          description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
+      - alert: certificate_check_error
+        expr: labmon_tlsconmon_certificate_check_error == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Error checking certificate for {{ $labels.address }}"
+          description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
+      - alert: step_ca_certificate_expiring
+        expr: labmon_stepmon_certificate_seconds_left < 3600
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Step-CA certificate expiring for {{ $labels.instance }}"
+          description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
+  - name: proxmox_rules
+    rules:
+      - alert: pve_node_down
+        expr: pve_up{id=~"node/.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Proxmox node {{ $labels.id }} is down"
+          description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
+      - alert: pve_guest_stopped
+        expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Proxmox VM {{ $labels.id }} is stopped"
+          description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."

@@ -6,4 +6,10 @@
     ./proxy.nix
     ./nix.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "nix-cache_caddy";
+    port = 443;
+    scheme = "https";
+  }];
 }