diff --git a/CLAUDE.md b/CLAUDE.md
index 7db25b4..74112e2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -122,9 +122,10 @@ This ensures documentation matches the exact nixpkgs version (currently NixOS 25
   - Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix
   - Monitoring: node-exporter and promtail on every host
 - `/modules/` - Custom NixOS modules
-  - `homelab/` - Homelab-specific options (DNS automation, etc.)
+  - `homelab/` - Homelab-specific options (DNS automation, monitoring scrape targets)
 - `/lib/` - Nix library functions
   - `dns-zone.nix` - DNS zone generation functions
+  - `monitoring.nix` - Prometheus scrape target generation functions
 - `/services/` - Reusable service modules, selectively imported by hosts
   - `home-assistant/` - Home automation stack
   - `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
@@ -156,6 +157,7 @@ All hosts automatically get:
 - Internal ACME CA integration (ca.home.2rjus.net)
 - Daily auto-upgrades with auto-reboot
 - Prometheus node-exporter + Promtail (logs to monitoring01)
+- Monitoring scrape target auto-registration via `homelab.monitoring` options
 - Custom root CA trust
 - DNS zone auto-registration via `homelab.dns` options
 
@@ -310,7 +312,7 @@ This means:
 11. Deploy by running `nixos-rebuild boot --flake URL#` on the host.
 12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry
 
-**Note:** DNS A records are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file editing is required.
+**Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required.
 
 ### Important Patterns
 
@@ -333,6 +335,32 @@ All hosts ship metrics and logs to `monitoring01`:
 - **Tracing**: Tempo for distributed tracing
 - **Profiling**: Pyroscope for continuous profiling
 
+**Scrape Target Auto-Generation:**
+
+Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
+
+- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
+- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
+- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
+- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
+
+Host monitoring options (`homelab.monitoring.*`):
+- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets
+- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host (job_name, port, metrics_path, scheme, scrape_interval, honor_labels)
+
+Service modules declare their scrape targets directly (e.g., `services/ca/default.nix` declares step-ca on port 9000). The Prometheus config on monitoring01 auto-generates scrape configs from all hosts.
+
+To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
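+
+As a concrete example, the step-ca declaration from `services/ca/default.nix`:
+
+```nix
+homelab.monitoring.scrapeTargets = [{
+  job_name = "step-ca";
+  port = 9000;
+}];
+```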
+
 ### DNS Architecture
 
 - `ns1` (10.69.13.5) - Primary authoritative DNS + resolver
diff --git a/hosts/http-proxy/wireguard.nix b/hosts/http-proxy/wireguard.nix
index 6485e69..5470996 100644
--- a/hosts/http-proxy/wireguard.nix
+++ b/hosts/http-proxy/wireguard.nix
@@ -26,7 +26,11 @@
       };
     };
   };
-  # monitoring
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "wireguard";
+    port = 9586;
+  }];
+
   services.prometheus.exporters.wireguard = {
     enable = true;
   };
diff --git a/lib/monitoring.nix b/lib/monitoring.nix
new file mode 100644
index 0000000..19e522a
--- /dev/null
+++ b/lib/monitoring.nix
@@ -0,0 +1,145 @@
+{ lib }:
+let
+  # Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
+  extractIP = address:
+    let
+      parts = lib.splitString "/" address;
+    in
+    builtins.head parts;
+
+  # Check if a network interface name looks like a VPN/tunnel interface
+  isVpnInterface = ifaceName:
+    lib.hasPrefix "wg" ifaceName ||
+    lib.hasPrefix "tun" ifaceName ||
+    lib.hasPrefix "tap" ifaceName ||
+    lib.hasPrefix "vti" ifaceName;
+
+  # Extract monitoring info from a single host configuration
+  # Returns null if host should not be included
+  extractHostMonitoring = name: hostConfig:
+    let
+      cfg = hostConfig.config;
+      monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
+      dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
+      hostname = cfg.networking.hostName;
+      networks = cfg.systemd.network.networks or { };
+
+      # Filter out VPN interfaces and find networks with static addresses
+      physicalNetworks = lib.filterAttrs
+        (netName: netCfg:
+          let
+            ifaceName = netCfg.matchConfig.Name or "";
+          in
+          !(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
+        networks;
+
+      # Get addresses from physical networks only
+      networkAddresses = lib.flatten (
+        lib.mapAttrsToList
+          (netName: netCfg: netCfg.address or [ ])
+          physicalNetworks
+      );
+
+      firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
+    in
+    if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
+      null
+    else
+      {
+        inherit hostname;
+        ip = extractIP firstAddress;
+        scrapeTargets = monConfig.scrapeTargets or [ ];
+      };
+
+  # Generate node-exporter targets from all flake hosts
+  generateNodeExporterTargets = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+      flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList;
+    in
+    flakeTargets ++ (externalTargets.nodeExporter or [ ]);
+
+  # Generate scrape configs from all flake hosts and external targets
+  generateScrapeConfigs = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+
+      # Collect all scrapeTargets from all hosts, grouped by job_name
+      allTargets = lib.flatten (map
+        (host:
+          map
+            (target: {
+              inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
+              hostname = host.hostname;
+            })
+            host.scrapeTargets
+        )
+        hostList
+      );
+
+      # Group targets by job_name
+      grouped = lib.groupBy (t: t.job_name) allTargets;
+
+      # Generate a scrape config for each job
+      flakeScrapeConfigs = lib.mapAttrsToList
+        (jobName: targets:
+          let
+            first = builtins.head targets;
+            targetAddrs = map
+              (t:
+                let
+                  portStr = toString t.port;
+                in
+                "${t.hostname}.home.2rjus.net:${portStr}")
+              targets;
+            config = {
+              job_name = jobName;
+              static_configs = [{
+                targets = targetAddrs;
+              }];
+            }
+            // (lib.optionalAttrs (first.metrics_path != "/metrics") {
+              metrics_path = first.metrics_path;
+            })
+            // (lib.optionalAttrs (first.scheme != "http") {
+              scheme = first.scheme;
+            })
+            // (lib.optionalAttrs (first.scrape_interval != null) {
+              scrape_interval = first.scrape_interval;
+            })
+            // (lib.optionalAttrs first.honor_labels {
+              honor_labels = true;
+            });
+          in
+          config
+        )
+        grouped;
+
+      # External scrape configs
+      externalScrapeConfigs = map
+        (ext: {
+          job_name = ext.job_name;
+          static_configs = [{
+            targets = ext.targets;
+          }];
+        } // (lib.optionalAttrs (ext ? metrics_path) {
+          metrics_path = ext.metrics_path;
+        }) // (lib.optionalAttrs (ext ? scheme) {
+          scheme = ext.scheme;
+        }) // (lib.optionalAttrs (ext ? scrape_interval) {
+          scrape_interval = ext.scrape_interval;
+        }))
+        (externalTargets.scrapeConfigs or [ ]);
+    in
+    flakeScrapeConfigs ++ externalScrapeConfigs;
+
+in
+{
+  inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
+}
diff --git a/modules/homelab/default.nix b/modules/homelab/default.nix
index 0d2ba01..b945a3d 100644
--- a/modules/homelab/default.nix
+++ b/modules/homelab/default.nix
@@ -2,5 +2,6 @@
 {
   imports = [
     ./dns.nix
+    ./monitoring.nix
   ];
 }
diff --git a/modules/homelab/monitoring.nix b/modules/homelab/monitoring.nix
new file mode 100644
index 0000000..b6e101d
--- /dev/null
+++ b/modules/homelab/monitoring.nix
@@ -0,0 +1,50 @@
+{ config, lib, ... }:
+let
+  cfg = config.homelab.monitoring;
+in
+{
+  options.homelab.monitoring = {
+    enable = lib.mkOption {
+      type = lib.types.bool;
+      default = true;
+      description = "Include this host in Prometheus node-exporter scrape targets";
+    };
+
+    scrapeTargets = lib.mkOption {
+      type = lib.types.listOf (lib.types.submodule {
+        options = {
+          job_name = lib.mkOption {
+            type = lib.types.str;
+            description = "Prometheus scrape job name";
+          };
+          port = lib.mkOption {
+            type = lib.types.port;
+            description = "Port to scrape metrics from";
+          };
+          metrics_path = lib.mkOption {
+            type = lib.types.str;
+            default = "/metrics";
+            description = "HTTP path to scrape metrics from";
+          };
+          scheme = lib.mkOption {
+            type = lib.types.str;
+            default = "http";
+            description = "HTTP scheme (http or https)";
+          };
+          scrape_interval = lib.mkOption {
+            type = lib.types.nullOr lib.types.str;
+            default = null;
+            description = "Override the global scrape interval for this target";
+          };
+          honor_labels = lib.mkOption {
+            type = lib.types.bool;
+            default = false;
+            description = "Whether to honor labels from the scraped target";
+          };
+        };
+      });
+      default = [ ];
+      description = "Additional Prometheus scrape targets exposed by this host";
+    };
+  };
+}
diff --git a/services/ca/default.nix b/services/ca/default.nix
index 9c52015..b5759a0 100644
--- a/services/ca/default.nix
+++ b/services/ca/default.nix
@@ -1,5 +1,9 @@
 { pkgs, unstable, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "step-ca";
+    port = 9000;
+  }];
   sops.secrets."ca_root_pw" = {
     sopsFile = ../../secrets/ca/secrets.yaml;
     owner = "step-ca";
diff --git a/services/home-assistant/default.nix b/services/home-assistant/default.nix
index c987fe4..14f4fce 100644
--- a/services/home-assistant/default.nix
+++ b/services/home-assistant/default.nix
@@ -1,5 +1,11 @@
 { pkgs, config, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "home-assistant";
+    port = 8123;
+    metrics_path = "/api/prometheus";
+    scrape_interval = "60s";
+  }];
   # Enable the Home Assistant service
   services.home-assistant = {
     enable = true;
diff --git a/services/http-proxy/default.nix b/services/http-proxy/default.nix
index d046d09..07ef28f 100644
--- a/services/http-proxy/default.nix
+++ b/services/http-proxy/default.nix
@@ -3,4 +3,9 @@
   imports = [
     ./proxy.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "caddy";
+    port = 80;
+  }];
 }
diff --git a/services/jellyfin/default.nix b/services/jellyfin/default.nix
index e2322de..729080c 100644
--- a/services/jellyfin/default.nix
+++ b/services/jellyfin/default.nix
@@ -1,5 +1,9 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "jellyfin";
+    port = 8096;
+  }];
   services.jellyfin = {
     enable = true;
   };
diff --git a/services/monitoring/external-targets.nix b/services/monitoring/external-targets.nix
new file mode 100644
index 0000000..debc0d5
--- /dev/null
+++ b/services/monitoring/external-targets.nix
@@ -0,0 +1,12 @@
+# Monitoring targets for hosts not managed by this flake
+# These are manually maintained and combined with auto-generated targets
+{
+  nodeExporter = [
+    "gunter.home.2rjus.net:9100"
+  ];
+  scrapeConfigs = [
+    { job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
+    { job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
+    { job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
+  ];
+}
diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix
index cb496ad..c96b817 100644
--- a/services/monitoring/prometheus.nix
+++ b/services/monitoring/prometheus.nix
@@ -1,4 +1,11 @@
-{ ... }:
+{ self, lib, ... }:
+let
+  monLib = import ../../lib/monitoring.nix { inherit lib; };
+  externalTargets = import ./external-targets.nix;
+
+  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
+  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
+in
 {
   services.prometheus = {
     enable = true;
@@ -45,26 +52,16 @@
     ];
 
     scrapeConfigs = [
+      # Auto-generated node-exporter targets from flake hosts + external
      {
        job_name = "node-exporter";
        static_configs = [
          {
-            targets = [
-              "ca.home.2rjus.net:9100"
-              "gunter.home.2rjus.net:9100"
-              "ha1.home.2rjus.net:9100"
-              "http-proxy.home.2rjus.net:9100"
-              "jelly01.home.2rjus.net:9100"
-              "monitoring01.home.2rjus.net:9100"
-              "nix-cache01.home.2rjus.net:9100"
-              "ns1.home.2rjus.net:9100"
-              "ns2.home.2rjus.net:9100"
-              "pgdb1.home.2rjus.net:9100"
-              "nats1.home.2rjus.net:9100"
-            ];
+            targets = nodeExporterTargets;
          }
        ];
      }
+      # Local monitoring services (not auto-generated)
      {
        job_name = "prometheus";
        static_configs = [
@@ -85,7 +82,7 @@
        job_name = "grafana";
        static_configs = [
          {
-            targets = [ "localhost:3100" ];
+            targets = [ "localhost:3000" ];
          }
        ];
      }
@@ -98,13 +95,23 @@
        ];
      }
      {
-        job_name = "restic_rest";
+        job_name = "pushgateway";
+        honor_labels = true;
        static_configs = [
          {
-            targets = [ "10.69.12.52:8000" ];
+            targets = [ "localhost:9091" ];
          }
        ];
      }
+      {
+        job_name = "labmon";
+        static_configs = [
+          {
+            targets = [ "monitoring01.home.2rjus.net:9969" ];
+          }
+        ];
+      }
+      # pve-exporter with complex relabel config
      {
        job_name = "pve-exporter";
        static_configs = [
@@ -133,91 +140,8 @@
          }
        ];
      }
-      {
-        job_name = "caddy";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net" ];
-          }
-        ];
-      }
-      {
-        job_name = "jellyfin";
-        static_configs = [
-          {
-            targets = [ "jelly01.home.2rjus.net:8096" ];
-          }
-        ];
-      }
-      {
-        job_name = "smartctl";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:9633" ];
-          }
-        ];
-      }
-      {
-        job_name = "wireguard";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net:9586" ];
-          }
-        ];
-      }
-      {
-        job_name = "home-assistant";
-        scrape_interval = "60s";
-        metrics_path = "/api/prometheus";
-        static_configs = [
-          {
-            targets = [ "ha1.home.2rjus.net:8123" ];
-          }
-        ];
-      }
-      {
-        job_name = "ghettoptt";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:8989" ];
-          }
-        ];
-      }
-      {
-        job_name = "step-ca";
-        static_configs = [
-          {
-            targets = [ "ca.home.2rjus.net:9000" ];
-          }
-        ];
-      }
-      {
-        job_name = "labmon";
-        static_configs = [
-          {
-            targets = [ "monitoring01.home.2rjus.net:9969" ];
-          }
-        ];
-      }
-      {
-        job_name = "pushgateway";
-        honor_labels = true;
-        static_configs = [
-          {
-            targets = [ "localhost:9091" ];
-          }
-        ];
-      }
-      {
-        job_name = "nix-cache_caddy";
-        scheme = "https";
-        static_configs = [
-          {
-            targets = [ "nix-cache.home.2rjus.net" ];
-          }
-        ];
-      }
-    ];
+    ] ++ autoScrapeConfigs;
+
    pushgateway = {
      enable = true;
      web = {
diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index 4c9afc0..c5eba34 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -57,6 +57,38 @@ groups:
       annotations:
         summary: "Promtail service not running on {{ $labels.instance }}"
         description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
+      - alert: filesystem_filling_up
+        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
+          description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
+      - alert: systemd_not_running
+        expr: node_systemd_system_running == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Systemd not in running state on {{ $labels.instance }}"
+          description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
+      - alert: high_file_descriptors
+        expr: node_filefd_allocated / node_filefd_maximum > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High file descriptor usage on {{ $labels.instance }}"
+          description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
+      - alert: host_reboot
+        expr: changes(node_boot_time_seconds[10m]) > 0
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: "Host {{ $labels.instance }} has rebooted"
+          description: "Host {{ $labels.instance }} has rebooted."
   - name: nameserver_rules
     rules:
       - alert: unbound_down
@@ -75,7 +107,7 @@
       annotations:
         summary: "NSD not running on {{ $labels.instance }}"
         description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
-  - name: http-proxy_rules
+  - name: http_proxy_rules
     rules:
       - alert: caddy_down
         expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@
       annotations:
         summary: "Caddy not running on {{ $labels.instance }}"
         description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: caddy_upstream_unhealthy
+        expr: caddy_reverse_proxy_upstreams_healthy == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
+          description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
+      - alert: caddy_high_error_rate
+        expr: rate(caddy_http_request_errors_total[5m]) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High HTTP error rate on {{ $labels.instance }}"
+          description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
   - name: nats_rules
     rules:
       - alert: nats_down
@@ -97,7 +145,7 @@
         description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
   - name: nix_cache_rules
     rules:
-      - alert: build-flakes_service_not_active_recently
+      - alert: build_flakes_service_not_active_recently
         expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
         for: 0m
         labels:
@@ -138,7 +186,7 @@
       annotations:
         summary: "Home assistant not running on {{ $labels.instance }}"
         description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
-      - alert: zigbee2qmtt_down
+      - alert: zigbee2mqtt_down
         expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
         for: 5m
         labels:
@@ -156,7 +204,7 @@
         description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
   - name: smartctl_rules
     rules:
-      - alert: SmartCriticalWarning
+      - alert: smart_critical_warning
         expr: smartctl_device_critical_warning > 0
         for: 0m
         labels:
@@ -164,7 +212,7 @@
       annotations:
         summary: SMART critical warning (instance {{ $labels.instance }})
         description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: SmartMediaErrors
+      - alert: smart_media_errors
         expr: smartctl_device_media_errors > 0
         for: 0m
         labels:
@@ -172,7 +220,7 @@
       annotations:
         summary: SMART media errors (instance {{ $labels.instance }})
         description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: SmartWearoutIndicator
+      - alert: smart_wearout_indicator
         expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
         for: 0m
         labels:
@@ -180,20 +228,29 @@
       annotations:
         summary: SMART Wearout Indicator (instance {{ $labels.instance }})
         description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: smartctl_high_temperature
+        expr: smartctl_device_temperature > 60
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk temperature above 60C on {{ $labels.instance }}"
+          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
   - name: wireguard_rules
     rules:
-      - alert: WireguardHandshake
-        expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
+      - alert: wireguard_handshake_timeout
+        expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
         for: 1m
         labels:
           severity: warning
         annotations:
           summary: "Wireguard handshake timeout on {{ $labels.instance }}"
-          description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
+          description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
   - name: monitoring_rules
     rules:
       - alert: prometheus_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -201,6 +258,7 @@
         description: "Prometheus service not running on {{ $labels.instance }}"
       - alert: alertmanager_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -208,13 +266,7 @@
         description: "Alertmanager service not running on {{ $labels.instance }}"
       - alert: pushgateway_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
-        labels:
-          severity: critical
-        annotations:
-          summary: "Pushgateway service not running on {{ $labels.instance }}"
-          description: "Pushgateway service not running on {{ $labels.instance }}"
-      - alert: pushgateway_not_running
-        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -222,6 +274,7 @@
         description: "Pushgateway service not running on {{ $labels.instance }}"
       - alert: loki_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -229,6 +282,7 @@
         description: "Loki service not running on {{ $labels.instance }}"
       - alert: grafana_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
@@ -236,6 +290,7 @@
         description: "Grafana service not running on {{ $labels.instance }}"
       - alert: tempo_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
@@ -243,8 +298,53 @@
         description: "Tempo service not running on {{ $labels.instance }}"
       - alert: pyroscope_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
           summary: "Pyroscope service not running on {{ $labels.instance }}"
           description: "Pyroscope service not running on {{ $labels.instance }}"
+  - name: certificate_rules
+    rules:
+      - alert: certificate_expiring_soon
+        expr: labmon_tlsconmon_certificate_seconds_left < 86400
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "TLS certificate expiring soon for {{ $labels.instance }}"
+          description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
+      - alert: certificate_check_error
+        expr: labmon_tlsconmon_certificate_check_error == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Error checking certificate for {{ $labels.address }}"
+          description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
+      - alert: step_ca_certificate_expiring
+        expr: labmon_stepmon_certificate_seconds_left < 3600
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Step-CA certificate expiring for {{ $labels.instance }}"
+          description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
+  - name: proxmox_rules
+    rules:
+      - alert: pve_node_down
+        expr: pve_up{id=~"node/.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Proxmox node {{ $labels.id }} is down"
+          description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
+      - alert: pve_guest_stopped
+        expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Proxmox VM {{ $labels.id }} is stopped"
+          description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
diff --git a/services/nix-cache/default.nix b/services/nix-cache/default.nix
index 18db40c..5db16b7 100644
--- a/services/nix-cache/default.nix
+++ b/services/nix-cache/default.nix
@@ -6,4 +6,10 @@
     ./proxy.nix
     ./nix.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "nix-cache_caddy";
+    port = 443;
+    scheme = "https";
+  }];
 }