From dd1b64de276afb364c3ad43bd3fd440a5b7a86a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Thu, 5 Feb 2026 00:49:07 +0100 Subject: [PATCH 1/2] monitoring: auto-generate Prometheus scrape targets from host configs Add homelab.monitoring NixOS options (enable, scrapeTargets) following the same pattern as homelab.dns. Prometheus scrape configs are now auto-generated from flake host configurations and external targets, replacing hardcoded target lists. Also cleans up alert rules: snake_case naming, fix zigbee2mqtt typo, remove duplicate pushgateway alert, add for clauses to monitoring_rules, remove hardcoded WireGuard public key, and add new alerts for certificates, proxmox, caddy, smartctl temperature, filesystem prediction, systemd state, file descriptors, and host reboots. Fixes grafana scrape target port from 3100 to 3000. Co-Authored-By: Claude Opus 4.5 --- hosts/http-proxy/wireguard.nix | 6 +- lib/monitoring.nix | 145 +++++++++++++++++++++++ modules/homelab/default.nix | 1 + modules/homelab/monitoring.nix | 50 ++++++++ services/ca/default.nix | 4 + services/home-assistant/default.nix | 6 + services/http-proxy/default.nix | 5 + services/jellyfin/default.nix | 4 + services/monitoring/external-targets.nix | 12 ++ services/monitoring/prometheus.nix | 128 ++++---------------- services/monitoring/rules.yml | 132 ++++++++++++++++++--- services/nix-cache/default.nix | 6 + 12 files changed, 380 insertions(+), 119 deletions(-) create mode 100644 lib/monitoring.nix create mode 100644 modules/homelab/monitoring.nix create mode 100644 services/monitoring/external-targets.nix diff --git a/hosts/http-proxy/wireguard.nix b/hosts/http-proxy/wireguard.nix index 6485e69..5470996 100644 --- a/hosts/http-proxy/wireguard.nix +++ b/hosts/http-proxy/wireguard.nix @@ -26,7 +26,11 @@ }; }; }; - # monitoring + homelab.monitoring.scrapeTargets = [{ + job_name = "wireguard"; + port = 9586; + }]; + services.prometheus.exporters.wireguard = { enable = true; }; diff --git a/lib/monitoring.nix b/lib/monitoring.nix new file mode 100644 index 0000000..19e522a --- /dev/null +++ b/lib/monitoring.nix @@ -0,0 +1,145 @@ +{ lib }: +let + # Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5") + extractIP = address: + let + parts = lib.splitString "/" address; + in + builtins.head parts; + + # Check if a network interface name looks like a VPN/tunnel interface + isVpnInterface = ifaceName: + lib.hasPrefix "wg" ifaceName || + lib.hasPrefix "tun" ifaceName || + lib.hasPrefix "tap" ifaceName || + lib.hasPrefix "vti" ifaceName; + + # Extract monitoring info from a single host configuration + # Returns null if host should not be included + extractHostMonitoring = name: hostConfig: + let + cfg = hostConfig.config; + monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; }; + dnsConfig = (cfg.homelab or { }).dns or { enable = true; }; + hostname = cfg.networking.hostName; + networks = cfg.systemd.network.networks or { }; + + # Filter out VPN interfaces and find networks with static addresses + physicalNetworks = lib.filterAttrs + (netName: netCfg: + let + ifaceName = netCfg.matchConfig.Name or ""; + in + !(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ]) + networks; + + # Get addresses from physical networks only + networkAddresses = lib.flatten ( + lib.mapAttrsToList + (netName: netCfg: netCfg.address or [ ]) + physicalNetworks + ); + + firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null; + in + if 
!(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then + null + else + { + inherit hostname; + ip = extractIP firstAddress; + scrapeTargets = monConfig.scrapeTargets or [ ]; + }; + + # Generate node-exporter targets from all flake hosts + generateNodeExporterTargets = self: externalTargets: + let + nixosConfigs = self.nixosConfigurations or { }; + hostList = lib.filter (x: x != null) ( + lib.mapAttrsToList extractHostMonitoring nixosConfigs + ); + flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList; + in + flakeTargets ++ (externalTargets.nodeExporter or [ ]); + + # Generate scrape configs from all flake hosts and external targets + generateScrapeConfigs = self: externalTargets: + let + nixosConfigs = self.nixosConfigurations or { }; + hostList = lib.filter (x: x != null) ( + lib.mapAttrsToList extractHostMonitoring nixosConfigs + ); + + # Collect all scrapeTargets from all hosts, grouped by job_name + allTargets = lib.flatten (map + (host: + map + (target: { + inherit (target) job_name port metrics_path scheme scrape_interval honor_labels; + hostname = host.hostname; + }) + host.scrapeTargets + ) + hostList + ); + + # Group targets by job_name + grouped = lib.groupBy (t: t.job_name) allTargets; + + # Generate a scrape config for each job + flakeScrapeConfigs = lib.mapAttrsToList + (jobName: targets: + let + first = builtins.head targets; + targetAddrs = map + (t: + let + portStr = toString t.port; + in + "${t.hostname}.home.2rjus.net:${portStr}") + targets; + config = { + job_name = jobName; + static_configs = [{ + targets = targetAddrs; + }]; + } + // (lib.optionalAttrs (first.metrics_path != "/metrics") { + metrics_path = first.metrics_path; + }) + // (lib.optionalAttrs (first.scheme != "http") { + scheme = first.scheme; + }) + // (lib.optionalAttrs (first.scrape_interval != null) { + scrape_interval = first.scrape_interval; + }) + // (lib.optionalAttrs first.honor_labels { + honor_labels = true; + }); + in + config + ) + grouped; + + # External scrape configs + externalScrapeConfigs = map + (ext: { + job_name = ext.job_name; + static_configs = [{ + targets = ext.targets; + }]; + } // (lib.optionalAttrs (ext ? metrics_path) { + metrics_path = ext.metrics_path; + }) // (lib.optionalAttrs (ext ? scheme) { + scheme = ext.scheme; + }) // (lib.optionalAttrs (ext ? scrape_interval) { + scrape_interval = ext.scrape_interval; + })) + (externalTargets.scrapeConfigs or [ ]); + in + flakeScrapeConfigs ++ externalScrapeConfigs; + +in +{ + inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs; +} diff --git a/modules/homelab/default.nix b/modules/homelab/default.nix index 0d2ba01..b945a3d 100644 --- a/modules/homelab/default.nix +++ b/modules/homelab/default.nix @@ -2,5 +2,6 @@ { imports = [ ./dns.nix + ./monitoring.nix ]; } diff --git a/modules/homelab/monitoring.nix b/modules/homelab/monitoring.nix new file mode 100644 index 0000000..b6e101d --- /dev/null +++ b/modules/homelab/monitoring.nix @@ -0,0 +1,50 @@ +{ config, lib, ... 
}: +let + cfg = config.homelab.monitoring; +in +{ + options.homelab.monitoring = { + enable = lib.mkOption { + type = lib.types.bool; + default = true; + description = "Include this host in Prometheus node-exporter scrape targets"; + }; + + scrapeTargets = lib.mkOption { + type = lib.types.listOf (lib.types.submodule { + options = { + job_name = lib.mkOption { + type = lib.types.str; + description = "Prometheus scrape job name"; + }; + port = lib.mkOption { + type = lib.types.port; + description = "Port to scrape metrics from"; + }; + metrics_path = lib.mkOption { + type = lib.types.str; + default = "/metrics"; + description = "HTTP path to scrape metrics from"; + }; + scheme = lib.mkOption { + type = lib.types.str; + default = "http"; + description = "HTTP scheme (http or https)"; + }; + scrape_interval = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Override the global scrape interval for this target"; + }; + honor_labels = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Whether to honor labels from the scraped target"; + }; + }; + }); + default = [ ]; + description = "Additional Prometheus scrape targets exposed by this host"; + }; + }; +} diff --git a/services/ca/default.nix b/services/ca/default.nix index 9c52015..b5759a0 100644 --- a/services/ca/default.nix +++ b/services/ca/default.nix @@ -1,5 +1,9 @@ { pkgs, unstable, ... }: { + homelab.monitoring.scrapeTargets = [{ + job_name = "step-ca"; + port = 9000; + }]; sops.secrets."ca_root_pw" = { sopsFile = ../../secrets/ca/secrets.yaml; owner = "step-ca"; diff --git a/services/home-assistant/default.nix b/services/home-assistant/default.nix index c987fe4..14f4fce 100644 --- a/services/home-assistant/default.nix +++ b/services/home-assistant/default.nix @@ -1,5 +1,11 @@ { pkgs, config, ... }: { + homelab.monitoring.scrapeTargets = [{ + job_name = "home-assistant"; + port = 8123; + metrics_path = "/api/prometheus"; + scrape_interval = "60s"; + }]; # Enable the Home Assistant service services.home-assistant = { enable = true; diff --git a/services/http-proxy/default.nix b/services/http-proxy/default.nix index d046d09..07ef28f 100644 --- a/services/http-proxy/default.nix +++ b/services/http-proxy/default.nix @@ -3,4 +3,9 @@ imports = [ ./proxy.nix ]; + + homelab.monitoring.scrapeTargets = [{ + job_name = "caddy"; + port = 80; + }]; } diff --git a/services/jellyfin/default.nix b/services/jellyfin/default.nix index e2322de..729080c 100644 --- a/services/jellyfin/default.nix +++ b/services/jellyfin/default.nix @@ -1,5 +1,9 @@ { pkgs, ... 
}: { + homelab.monitoring.scrapeTargets = [{ + job_name = "jellyfin"; + port = 8096; + }]; services.jellyfin = { enable = true; }; diff --git a/services/monitoring/external-targets.nix b/services/monitoring/external-targets.nix new file mode 100644 index 0000000..debc0d5 --- /dev/null +++ b/services/monitoring/external-targets.nix @@ -0,0 +1,12 @@ +# Monitoring targets for hosts not managed by this flake +# These are manually maintained and combined with auto-generated targets +{ + nodeExporter = [ + "gunter.home.2rjus.net:9100" + ]; + scrapeConfigs = [ + { job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; } + { job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; } + { job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; } + ]; +} diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix index cb496ad..c96b817 100644 --- a/services/monitoring/prometheus.nix +++ b/services/monitoring/prometheus.nix @@ -1,4 +1,11 @@ -{ ... }: +{ self, lib, ... }: +let + monLib = import ../../lib/monitoring.nix { inherit lib; }; + externalTargets = import ./external-targets.nix; + + nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets; + autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets; +in { services.prometheus = { enable = true; @@ -45,26 +52,16 @@ ]; scrapeConfigs = [ + # Auto-generated node-exporter targets from flake hosts + external { job_name = "node-exporter"; static_configs = [ { - targets = [ - "ca.home.2rjus.net:9100" - "gunter.home.2rjus.net:9100" - "ha1.home.2rjus.net:9100" - "http-proxy.home.2rjus.net:9100" - "jelly01.home.2rjus.net:9100" - "monitoring01.home.2rjus.net:9100" - "nix-cache01.home.2rjus.net:9100" - "ns1.home.2rjus.net:9100" - "ns2.home.2rjus.net:9100" - "pgdb1.home.2rjus.net:9100" - "nats1.home.2rjus.net:9100" - ]; + targets = nodeExporterTargets; } ]; } + # Local monitoring services (not auto-generated) { job_name = "prometheus"; static_configs = [ @@ -85,7 +82,7 @@ job_name = "grafana"; static_configs = [ { - targets = [ "localhost:3100" ]; + targets = [ "localhost:3000" ]; } ]; } @@ -98,13 +95,23 @@ ]; } { - job_name = "restic_rest"; + job_name = "pushgateway"; + honor_labels = true; static_configs = [ { - targets = [ "10.69.12.52:8000" ]; + targets = [ "localhost:9091" ]; } ]; } + { + job_name = "labmon"; + static_configs = [ + { + targets = [ "monitoring01.home.2rjus.net:9969" ]; + } + ]; + } + # pve-exporter with complex relabel config { job_name = "pve-exporter"; static_configs = [ @@ -133,91 +140,8 @@ } ]; } - { - job_name = "caddy"; - static_configs = [ - { - targets = [ "http-proxy.home.2rjus.net" ]; - } - ]; - } - { - job_name = "jellyfin"; - static_configs = [ - { - targets = [ "jelly01.home.2rjus.net:8096" ]; - } - ]; - } - { - job_name = "smartctl"; - static_configs = [ - { - targets = [ "gunter.home.2rjus.net:9633" ]; - } - ]; - } - { - job_name = "wireguard"; - static_configs = [ - { - targets = [ "http-proxy.home.2rjus.net:9586" ]; - } - ]; - } - { - job_name = "home-assistant"; - scrape_interval = "60s"; - metrics_path = "/api/prometheus"; - static_configs = [ - { - targets = [ "ha1.home.2rjus.net:8123" ]; - } - ]; - } - { - job_name = "ghettoptt"; - static_configs = [ - { - targets = [ "gunter.home.2rjus.net:8989" ]; - } - ]; - } - { - job_name = "step-ca"; - static_configs = [ - { - targets = [ "ca.home.2rjus.net:9000" ]; - } - ]; - } - { - job_name = "labmon"; - static_configs = [ - { - targets = [ "monitoring01.home.2rjus.net:9969" ]; - } - ]; - } - 
{ - job_name = "pushgateway"; - honor_labels = true; - static_configs = [ - { - targets = [ "localhost:9091" ]; - } - ]; - } - { - job_name = "nix-cache_caddy"; - scheme = "https"; - static_configs = [ - { - targets = [ "nix-cache.home.2rjus.net" ]; - } - ]; - } - ]; + ] ++ autoScrapeConfigs; + pushgateway = { enable = true; web = { diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index 4c9afc0..c5eba34 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -57,6 +57,38 @@ groups: annotations: summary: "Promtail service not running on {{ $labels.instance }}" description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes." + - alert: filesystem_filling_up + expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0 + for: 1h + labels: + severity: warning + annotations: + summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}" + description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours." + - alert: systemd_not_running + expr: node_systemd_system_running == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Systemd not in running state on {{ $labels.instance }}" + description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state." + - alert: high_file_descriptors + expr: node_filefd_allocated / node_filefd_maximum > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "High file descriptor usage on {{ $labels.instance }}" + description: "More than 80% of file descriptors are in use on {{ $labels.instance }}." + - alert: host_reboot + expr: changes(node_boot_time_seconds[10m]) > 0 + for: 0m + labels: + severity: info + annotations: + summary: "Host {{ $labels.instance }} has rebooted" + description: "Host {{ $labels.instance }} has rebooted." - name: nameserver_rules rules: - alert: unbound_down @@ -75,7 +107,7 @@ groups: annotations: summary: "NSD not running on {{ $labels.instance }}" description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." - - name: http-proxy_rules + - name: http_proxy_rules rules: - alert: caddy_down expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0 @@ -85,6 +117,22 @@ groups: annotations: summary: "Caddy not running on {{ $labels.instance }}" description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." + - alert: caddy_upstream_unhealthy + expr: caddy_reverse_proxy_upstreams_healthy == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Caddy upstream unhealthy for {{ $labels.upstream }}" + description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}." + - alert: caddy_high_error_rate + expr: rate(caddy_http_request_errors_total[5m]) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "High HTTP error rate on {{ $labels.instance }}" + description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}." - name: nats_rules rules: - alert: nats_down @@ -97,7 +145,7 @@ groups: description: "NATS has been down on {{ $labels.instance }} more than 5 minutes." 
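+  # NB: count_over_time counts samples regardless of their value, so the
+  # "< 1" check in the first alert below returns either >= 1 (series present
+  # in the window) or an empty result (series absent), and thus cannot fire
+  # as written; sum_over_time(...) < 1 is the usual idiom for "unit never
+  # active during the window".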
- name: nix_cache_rules rules: - - alert: build-flakes_service_not_active_recently + - alert: build_flakes_service_not_active_recently expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1 for: 0m labels: @@ -138,7 +186,7 @@ groups: annotations: summary: "Home assistant not running on {{ $labels.instance }}" description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes." - - alert: zigbee2qmtt_down + - alert: zigbee2mqtt_down expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0 for: 5m labels: @@ -156,7 +204,7 @@ groups: description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes." - name: smartctl_rules rules: - - alert: SmartCriticalWarning + - alert: smart_critical_warning expr: smartctl_device_critical_warning > 0 for: 0m labels: @@ -164,7 +212,7 @@ groups: annotations: summary: SMART critical warning (instance {{ $labels.instance }}) description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: SmartMediaErrors + - alert: smart_media_errors expr: smartctl_device_media_errors > 0 for: 0m labels: @@ -172,7 +220,7 @@ groups: annotations: summary: SMART media errors (instance {{ $labels.instance }}) description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: SmartWearoutIndicator + - alert: smart_wearout_indicator expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold for: 0m labels: @@ -180,20 +228,29 @@ groups: annotations: summary: SMART Wearout Indicator (instance {{ $labels.instance }}) description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: smartctl_high_temperature + expr: smartctl_device_temperature > 60 + for: 5m + labels: + severity: warning + annotations: + summary: "Disk temperature above 60C on {{ $labels.instance }}" + description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C." - name: wireguard_rules rules: - - alert: WireguardHandshake - expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300 + - alert: wireguard_handshake_timeout + expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300 for: 1m labels: severity: warning annotations: summary: "Wireguard handshake timeout on {{ $labels.instance }}" - description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes." + description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}." 
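+      # The 300s threshold assumes peers pass traffic regularly: an active
+      # WireGuard peer re-handshakes roughly every two minutes, so five or
+      # more minutes without a handshake suggests a stuck tunnel. Note that
+      # a peer that is simply idle will also trip this, since WireGuard only
+      # negotiates handshakes on demand.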
- name: monitoring_rules rules: - alert: prometheus_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0 + for: 5m labels: severity: critical annotations: @@ -201,6 +258,7 @@ groups: description: "Prometheus service not running on {{ $labels.instance }}" - alert: alertmanager_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 + for: 5m labels: severity: critical annotations: @@ -208,13 +266,7 @@ groups: description: "Alertmanager service not running on {{ $labels.instance }}" - alert: pushgateway_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0 - labels: - severity: critical - annotations: - summary: "Pushgateway service not running on {{ $labels.instance }}" - description: "Pushgateway service not running on {{ $labels.instance }}" - - alert: pushgateway_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0 + for: 5m labels: severity: critical annotations: @@ -222,6 +274,7 @@ groups: description: "Pushgateway service not running on {{ $labels.instance }}" - alert: loki_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0 + for: 5m labels: severity: critical annotations: @@ -229,6 +282,7 @@ groups: description: "Loki service not running on {{ $labels.instance }}" - alert: grafana_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 + for: 5m labels: severity: warning annotations: @@ -236,6 +290,7 @@ groups: description: "Grafana service not running on {{ $labels.instance }}" - alert: tempo_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0 + for: 5m labels: severity: warning annotations: @@ -243,8 +298,53 @@ groups: description: "Tempo service not running on {{ $labels.instance }}" - alert: pyroscope_not_running expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0 + for: 5m labels: severity: warning annotations: summary: "Pyroscope service not running on {{ $labels.instance }}" description: "Pyroscope service not running on {{ $labels.instance }}" + - name: certificate_rules + rules: + - alert: certificate_expiring_soon + expr: labmon_tlsconmon_certificate_seconds_left < 86400 + for: 5m + labels: + severity: warning + annotations: + summary: "TLS certificate expiring soon for {{ $labels.instance }}" + description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours." + - alert: certificate_check_error + expr: labmon_tlsconmon_certificate_check_error == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Error checking certificate for {{ $labels.address }}" + description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}." + - alert: step_ca_certificate_expiring + expr: labmon_stepmon_certificate_seconds_left < 3600 + for: 5m + labels: + severity: critical + annotations: + summary: "Step-CA certificate expiring for {{ $labels.instance }}" + description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}." 
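+  # The 1h threshold here (vs. 24h for the general certificate alert above)
+  # assumes short-lived leaf certificates; step-ca issues 24h certificates
+  # by default, so renewal problems must be caught within hours, not days.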
+ - name: proxmox_rules + rules: + - alert: pve_node_down + expr: pve_up{id=~"node/.*"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Proxmox node {{ $labels.id }} is down" + description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes." + - alert: pve_guest_stopped + expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Proxmox VM {{ $labels.id }} is stopped" + description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped." diff --git a/services/nix-cache/default.nix b/services/nix-cache/default.nix index 18db40c..5db16b7 100644 --- a/services/nix-cache/default.nix +++ b/services/nix-cache/default.nix @@ -6,4 +6,10 @@ ./proxy.nix ./nix.nix ]; + + homelab.monitoring.scrapeTargets = [{ + job_name = "nix-cache_caddy"; + port = 443; + scheme = "https"; + }]; } -- 2.49.1 From e7980978c7a79e905dbf4bee09ba62cca83ef00a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Thu, 5 Feb 2026 00:52:39 +0100 Subject: [PATCH 2/2] docs: document monitoring auto-generation in CLAUDE.md Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7db25b4..74112e2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -122,9 +122,10 @@ This ensures documentation matches the exact nixpkgs version (currently NixOS 25 - Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix - Monitoring: node-exporter and promtail on every host - `/modules/` - Custom NixOS modules - - `homelab/` - Homelab-specific options (DNS automation, etc.) + - `homelab/` - Homelab-specific options (DNS automation, monitoring scrape targets) - `/lib/` - Nix library functions - `dns-zone.nix` - DNS zone generation functions + - `monitoring.nix` - Prometheus scrape target generation functions - `/services/` - Reusable service modules, selectively imported by hosts - `home-assistant/` - Home automation stack - `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo) @@ -156,6 +157,7 @@ All hosts automatically get: - Internal ACME CA integration (ca.home.2rjus.net) - Daily auto-upgrades with auto-reboot - Prometheus node-exporter + Promtail (logs to monitoring01) +- Monitoring scrape target auto-registration via `homelab.monitoring` options - Custom root CA trust - DNS zone auto-registration via `homelab.dns` options @@ -310,7 +312,7 @@ This means: 11. Deploy by running `nixos-rebuild boot --flake URL#` on the host. 12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry -**Note:** DNS A records are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file editing is required. +**Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required. 
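+
+For example, a service module that ships its own exporter can register a
+scrape target like this (job name and port are illustrative):
+
+```nix
+homelab.monitoring.scrapeTargets = [{
+  job_name = "my-exporter"; # hypothetical job name
+  port = 9999;              # hypothetical exporter port
+}];
+```
+
+The auto-generated job then scrapes `<hostname>.home.2rjus.net:9999` from
+monitoring01.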
### Important Patterns @@ -333,6 +335,23 @@ All hosts ship metrics and logs to `monitoring01`: - **Tracing**: Tempo for distributed tracing - **Profiling**: Pyroscope for continuous profiling +**Scrape Target Auto-Generation:** + +Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation: + +- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets +- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules +- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix` +- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs` + +Host monitoring options (`homelab.monitoring.*`): +- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets +- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host (job_name, port, metrics_path, scheme, scrape_interval, honor_labels) + +Service modules declare their scrape targets directly (e.g., `services/ca/default.nix` declares step-ca on port 9000). The Prometheus config on monitoring01 auto-generates scrape configs from all hosts. + +To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`. + ### DNS Architecture - `ns1` (10.69.13.5) - Primary authoritative DNS + resolver -- 2.49.1