monitoring: auto-generate Prometheus scrape targets from host configs #16

Merged
torjus merged 2 commits from monitoring-improvements into master 2026-02-04 23:53:46 +00:00
13 changed files with 401 additions and 121 deletions

View File

@@ -122,9 +122,10 @@ This ensures documentation matches the exact nixpkgs version (currently NixOS 25
- Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix - Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix
- Monitoring: node-exporter and promtail on every host - Monitoring: node-exporter and promtail on every host
- `/modules/` - Custom NixOS modules - `/modules/` - Custom NixOS modules
- `homelab/` - Homelab-specific options (DNS automation, etc.) - `homelab/` - Homelab-specific options (DNS automation, monitoring scrape targets)
- `/lib/` - Nix library functions - `/lib/` - Nix library functions
- `dns-zone.nix` - DNS zone generation functions - `dns-zone.nix` - DNS zone generation functions
- `monitoring.nix` - Prometheus scrape target generation functions
- `/services/` - Reusable service modules, selectively imported by hosts - `/services/` - Reusable service modules, selectively imported by hosts
- `home-assistant/` - Home automation stack - `home-assistant/` - Home automation stack
- `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo) - `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
@@ -156,6 +157,7 @@ All hosts automatically get:
- Internal ACME CA integration (ca.home.2rjus.net) - Internal ACME CA integration (ca.home.2rjus.net)
- Daily auto-upgrades with auto-reboot - Daily auto-upgrades with auto-reboot
- Prometheus node-exporter + Promtail (logs to monitoring01) - Prometheus node-exporter + Promtail (logs to monitoring01)
- Monitoring scrape target auto-registration via `homelab.monitoring` options
- Custom root CA trust - Custom root CA trust
- DNS zone auto-registration via `homelab.dns` options - DNS zone auto-registration via `homelab.dns` options
@@ -310,7 +312,7 @@ This means:
11. Deploy by running `nixos-rebuild boot --flake URL#<hostname>` on the host. 11. Deploy by running `nixos-rebuild boot --flake URL#<hostname>` on the host.
12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry 12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry
**Note:** DNS A records are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file editing is required. **Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required.
### Important Patterns ### Important Patterns
@@ -333,6 +335,23 @@ All hosts ship metrics and logs to `monitoring01`:
- **Tracing**: Tempo for distributed tracing - **Tracing**: Tempo for distributed tracing
- **Profiling**: Pyroscope for continuous profiling - **Profiling**: Pyroscope for continuous profiling
**Scrape Target Auto-Generation:**
Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
Host monitoring options (`homelab.monitoring.*`):
- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets
- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host (job_name, port, metrics_path, scheme, scrape_interval, honor_labels)
Service modules declare their scrape targets directly (e.g., `services/ca/default.nix` declares step-ca on port 9000). The Prometheus config on monitoring01 auto-generates scrape configs from all hosts.
To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
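For example, a minimal sketch of the declaration pattern (mirroring the step-ca target referenced above, as added by this PR in `services/ca/default.nix`):

```nix
# The host exposes step-ca metrics on port 9000; the Prometheus config on
# monitoring01 picks this up and generates the matching scrape config.
{
  homelab.monitoring.scrapeTargets = [{
    job_name = "step-ca";
    port = 9000;
  }];
}
```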
### DNS Architecture ### DNS Architecture
- `ns1` (10.69.13.5) - Primary authoritative DNS + resolver - `ns1` (10.69.13.5) - Primary authoritative DNS + resolver

View File

@@ -26,7 +26,11 @@
}; };
}; };
}; };
# monitoring homelab.monitoring.scrapeTargets = [{
job_name = "wireguard";
port = 9586;
}];
services.prometheus.exporters.wireguard = { services.prometheus.exporters.wireguard = {
enable = true; enable = true;
}; };

lib/monitoring.nix (new file, 145 lines)
View File

@@ -0,0 +1,145 @@
{ lib }:
let
# Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
extractIP = address:
let
parts = lib.splitString "/" address;
in
builtins.head parts;
# Check if a network interface name looks like a VPN/tunnel interface
isVpnInterface = ifaceName:
lib.hasPrefix "wg" ifaceName ||
lib.hasPrefix "tun" ifaceName ||
lib.hasPrefix "tap" ifaceName ||
lib.hasPrefix "vti" ifaceName;
# Extract monitoring info from a single host configuration
# Returns null if host should not be included
extractHostMonitoring = name: hostConfig:
let
cfg = hostConfig.config;
monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
hostname = cfg.networking.hostName;
networks = cfg.systemd.network.networks or { };
# Filter out VPN interfaces and find networks with static addresses
physicalNetworks = lib.filterAttrs
(netName: netCfg:
let
ifaceName = netCfg.matchConfig.Name or "";
in
!(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
networks;
# Get addresses from physical networks only
networkAddresses = lib.flatten (
lib.mapAttrsToList
(netName: netCfg: netCfg.address or [ ])
physicalNetworks
);
firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
in
if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
null
else
{
inherit hostname;
ip = extractIP firstAddress;
scrapeTargets = monConfig.scrapeTargets or [ ];
};
# Generate node-exporter targets from all flake hosts
generateNodeExporterTargets = self: externalTargets:
let
nixosConfigs = self.nixosConfigurations or { };
hostList = lib.filter (x: x != null) (
lib.mapAttrsToList extractHostMonitoring nixosConfigs
);
flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList;
in
flakeTargets ++ (externalTargets.nodeExporter or [ ]);
# Generate scrape configs from all flake hosts and external targets
generateScrapeConfigs = self: externalTargets:
let
nixosConfigs = self.nixosConfigurations or { };
hostList = lib.filter (x: x != null) (
lib.mapAttrsToList extractHostMonitoring nixosConfigs
);
# Collect all scrapeTargets from all hosts, grouped by job_name
allTargets = lib.flatten (map
(host:
map
(target: {
inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
hostname = host.hostname;
})
host.scrapeTargets
)
hostList
);
# Group targets by job_name
grouped = lib.groupBy (t: t.job_name) allTargets;
# Generate a scrape config for each job
flakeScrapeConfigs = lib.mapAttrsToList
(jobName: targets:
let
first = builtins.head targets;
targetAddrs = map
(t:
let
portStr = toString t.port;
in
"${t.hostname}.home.2rjus.net:${portStr}")
targets;
config = {
job_name = jobName;
static_configs = [{
targets = targetAddrs;
}];
}
// (lib.optionalAttrs (first.metrics_path != "/metrics") {
metrics_path = first.metrics_path;
})
// (lib.optionalAttrs (first.scheme != "http") {
scheme = first.scheme;
})
// (lib.optionalAttrs (first.scrape_interval != null) {
scrape_interval = first.scrape_interval;
})
// (lib.optionalAttrs first.honor_labels {
honor_labels = true;
});
in
config
)
grouped;
# External scrape configs
externalScrapeConfigs = map
(ext: {
job_name = ext.job_name;
static_configs = [{
targets = ext.targets;
}];
} // (lib.optionalAttrs (ext ? metrics_path) {
metrics_path = ext.metrics_path;
}) // (lib.optionalAttrs (ext ? scheme) {
scheme = ext.scheme;
}) // (lib.optionalAttrs (ext ? scrape_interval) {
scrape_interval = ext.scrape_interval;
}))
(externalTargets.scrapeConfigs or [ ]);
in
flakeScrapeConfigs ++ externalScrapeConfigs;
in
{
inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
}
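A minimal sketch of what these generators produce (illustrative only, assuming a host named `http-proxy` that declares the `caddy` job on port 80 as elsewhere in this PR): targets sharing a `job_name` are grouped into a single scrape config, and non-default settings are taken from the first target in the group.

```nix
# Approximate result of generateScrapeConfigs for that single declaration;
# metrics_path, scheme, scrape_interval and honor_labels are omitted
# because they are left at their defaults.
[
  {
    job_name = "caddy";
    static_configs = [{ targets = [ "http-proxy.home.2rjus.net:80" ]; }];
  }
]
```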

View File

@@ -2,5 +2,6 @@
{ {
imports = [ imports = [
./dns.nix ./dns.nix
./monitoring.nix
]; ];
} }

View File

@@ -0,0 +1,50 @@
{ config, lib, ... }:
let
cfg = config.homelab.monitoring;
in
{
options.homelab.monitoring = {
enable = lib.mkOption {
type = lib.types.bool;
default = true;
description = "Include this host in Prometheus node-exporter scrape targets";
};
scrapeTargets = lib.mkOption {
type = lib.types.listOf (lib.types.submodule {
options = {
job_name = lib.mkOption {
type = lib.types.str;
description = "Prometheus scrape job name";
};
port = lib.mkOption {
type = lib.types.port;
description = "Port to scrape metrics from";
};
metrics_path = lib.mkOption {
type = lib.types.str;
default = "/metrics";
description = "HTTP path to scrape metrics from";
};
scheme = lib.mkOption {
type = lib.types.str;
default = "http";
description = "HTTP scheme (http or https)";
};
scrape_interval = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "Override the global scrape interval for this target";
};
honor_labels = lib.mkOption {
type = lib.types.bool;
default = false;
description = "Whether to honor labels from the scraped target";
};
};
});
default = [ ];
description = "Additional Prometheus scrape targets exposed by this host";
};
};
}
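A hypothetical per-host usage of these options (the job name and port below are illustrative, not taken from this repository):

```nix
{
  homelab.monitoring.scrapeTargets = [{
    job_name = "example-exporter";
    port = 9999;
    scheme = "https";
    scrape_interval = "60s";
    honor_labels = true;
  }];
  # Setting homelab.monitoring.enable = false instead would exclude the host
  # (including any scrapeTargets it declares) from the generated config.
}
```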

View File

@@ -1,5 +1,9 @@
{ pkgs, unstable, ... }: { pkgs, unstable, ... }:
{ {
homelab.monitoring.scrapeTargets = [{
job_name = "step-ca";
port = 9000;
}];
sops.secrets."ca_root_pw" = { sops.secrets."ca_root_pw" = {
sopsFile = ../../secrets/ca/secrets.yaml; sopsFile = ../../secrets/ca/secrets.yaml;
owner = "step-ca"; owner = "step-ca";

View File

@@ -1,5 +1,11 @@
{ pkgs, config, ... }: { pkgs, config, ... }:
{ {
homelab.monitoring.scrapeTargets = [{
job_name = "home-assistant";
port = 8123;
metrics_path = "/api/prometheus";
scrape_interval = "60s";
}];
# Enable the Home Assistant service # Enable the Home Assistant service
services.home-assistant = { services.home-assistant = {
enable = true; enable = true;

View File

@@ -3,4 +3,9 @@
imports = [ imports = [
./proxy.nix ./proxy.nix
]; ];
homelab.monitoring.scrapeTargets = [{
job_name = "caddy";
port = 80;
}];
} }

View File

@@ -1,5 +1,9 @@
{ pkgs, ... }: { pkgs, ... }:
{ {
homelab.monitoring.scrapeTargets = [{
job_name = "jellyfin";
port = 8096;
}];
services.jellyfin = { services.jellyfin = {
enable = true; enable = true;
}; };

View File

@@ -0,0 +1,12 @@
# Monitoring targets for hosts not managed by this flake
# These are manually maintained and combined with auto-generated targets
{
nodeExporter = [
"gunter.home.2rjus.net:9100"
];
scrapeConfigs = [
{ job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
{ job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
{ job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
];
}

View File

@@ -1,4 +1,11 @@
{ ... }: { self, lib, ... }:
let
monLib = import ../../lib/monitoring.nix { inherit lib; };
externalTargets = import ./external-targets.nix;
nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
in
{ {
services.prometheus = { services.prometheus = {
enable = true; enable = true;
@@ -45,26 +52,16 @@
]; ];
scrapeConfigs = [ scrapeConfigs = [
# Auto-generated node-exporter targets from flake hosts + external
{ {
job_name = "node-exporter"; job_name = "node-exporter";
static_configs = [ static_configs = [
{ {
targets = [ targets = nodeExporterTargets;
"ca.home.2rjus.net:9100"
"gunter.home.2rjus.net:9100"
"ha1.home.2rjus.net:9100"
"http-proxy.home.2rjus.net:9100"
"jelly01.home.2rjus.net:9100"
"monitoring01.home.2rjus.net:9100"
"nix-cache01.home.2rjus.net:9100"
"ns1.home.2rjus.net:9100"
"ns2.home.2rjus.net:9100"
"pgdb1.home.2rjus.net:9100"
"nats1.home.2rjus.net:9100"
];
} }
]; ];
} }
# Local monitoring services (not auto-generated)
{ {
job_name = "prometheus"; job_name = "prometheus";
static_configs = [ static_configs = [
@@ -85,7 +82,7 @@
job_name = "grafana"; job_name = "grafana";
static_configs = [ static_configs = [
{ {
targets = [ "localhost:3100" ]; targets = [ "localhost:3000" ];
} }
]; ];
} }
@@ -98,13 +95,23 @@
]; ];
} }
{ {
job_name = "restic_rest"; job_name = "pushgateway";
honor_labels = true;
static_configs = [ static_configs = [
{ {
targets = [ "10.69.12.52:8000" ]; targets = [ "localhost:9091" ];
} }
]; ];
} }
{
job_name = "labmon";
static_configs = [
{
targets = [ "monitoring01.home.2rjus.net:9969" ];
}
];
}
# pve-exporter with complex relabel config
{ {
job_name = "pve-exporter"; job_name = "pve-exporter";
static_configs = [ static_configs = [
@@ -133,91 +140,8 @@
} }
]; ];
} }
{ ] ++ autoScrapeConfigs;
job_name = "caddy";
static_configs = [
{
targets = [ "http-proxy.home.2rjus.net" ];
}
];
}
{
job_name = "jellyfin";
static_configs = [
{
targets = [ "jelly01.home.2rjus.net:8096" ];
}
];
}
{
job_name = "smartctl";
static_configs = [
{
targets = [ "gunter.home.2rjus.net:9633" ];
}
];
}
{
job_name = "wireguard";
static_configs = [
{
targets = [ "http-proxy.home.2rjus.net:9586" ];
}
];
}
{
job_name = "home-assistant";
scrape_interval = "60s";
metrics_path = "/api/prometheus";
static_configs = [
{
targets = [ "ha1.home.2rjus.net:8123" ];
}
];
}
{
job_name = "ghettoptt";
static_configs = [
{
targets = [ "gunter.home.2rjus.net:8989" ];
}
];
}
{
job_name = "step-ca";
static_configs = [
{
targets = [ "ca.home.2rjus.net:9000" ];
}
];
}
{
job_name = "labmon";
static_configs = [
{
targets = [ "monitoring01.home.2rjus.net:9969" ];
}
];
}
{
job_name = "pushgateway";
honor_labels = true;
static_configs = [
{
targets = [ "localhost:9091" ];
}
];
}
{
job_name = "nix-cache_caddy";
scheme = "https";
static_configs = [
{
targets = [ "nix-cache.home.2rjus.net" ];
}
];
}
];
pushgateway = { pushgateway = {
enable = true; enable = true;
web = { web = {

View File

@@ -57,6 +57,38 @@ groups:
annotations: annotations:
summary: "Promtail service not running on {{ $labels.instance }}" summary: "Promtail service not running on {{ $labels.instance }}"
description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes." description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
- alert: filesystem_filling_up
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
for: 1h
labels:
severity: warning
annotations:
summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
- alert: systemd_not_running
expr: node_systemd_system_running == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Systemd not in running state on {{ $labels.instance }}"
description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
- alert: high_file_descriptors
expr: node_filefd_allocated / node_filefd_maximum > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High file descriptor usage on {{ $labels.instance }}"
description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
- alert: host_reboot
expr: changes(node_boot_time_seconds[10m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: "Host {{ $labels.instance }} has rebooted"
description: "Host {{ $labels.instance }} has rebooted."
- name: nameserver_rules - name: nameserver_rules
rules: rules:
- alert: unbound_down - alert: unbound_down
@@ -75,7 +107,7 @@ groups:
annotations: annotations:
summary: "NSD not running on {{ $labels.instance }}" summary: "NSD not running on {{ $labels.instance }}"
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
- name: http-proxy_rules - name: http_proxy_rules
rules: rules:
- alert: caddy_down - alert: caddy_down
expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0 expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@ groups:
annotations: annotations:
summary: "Caddy not running on {{ $labels.instance }}" summary: "Caddy not running on {{ $labels.instance }}"
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
- alert: caddy_upstream_unhealthy
expr: caddy_reverse_proxy_upstreams_healthy == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
- alert: caddy_high_error_rate
expr: rate(caddy_http_request_errors_total[5m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "High HTTP error rate on {{ $labels.instance }}"
description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
- name: nats_rules - name: nats_rules
rules: rules:
- alert: nats_down - alert: nats_down
@@ -97,7 +145,7 @@ groups:
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes." description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
- name: nix_cache_rules - name: nix_cache_rules
rules: rules:
- alert: build-flakes_service_not_active_recently - alert: build_flakes_service_not_active_recently
expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1 expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
for: 0m for: 0m
labels: labels:
@@ -138,7 +186,7 @@ groups:
annotations: annotations:
summary: "Home assistant not running on {{ $labels.instance }}" summary: "Home assistant not running on {{ $labels.instance }}"
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes." description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
- alert: zigbee2qmtt_down - alert: zigbee2mqtt_down
expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0 expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
for: 5m for: 5m
labels: labels:
@@ -156,7 +204,7 @@ groups:
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes." description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
- name: smartctl_rules - name: smartctl_rules
rules: rules:
- alert: SmartCriticalWarning - alert: smart_critical_warning
expr: smartctl_device_critical_warning > 0 expr: smartctl_device_critical_warning > 0
for: 0m for: 0m
labels: labels:
@@ -164,7 +212,7 @@ groups:
annotations: annotations:
summary: SMART critical warning (instance {{ $labels.instance }}) summary: SMART critical warning (instance {{ $labels.instance }})
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors - alert: smart_media_errors
expr: smartctl_device_media_errors > 0 expr: smartctl_device_media_errors > 0
for: 0m for: 0m
labels: labels:
@@ -172,7 +220,7 @@ groups:
annotations: annotations:
summary: SMART media errors (instance {{ $labels.instance }}) summary: SMART media errors (instance {{ $labels.instance }})
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartWearoutIndicator - alert: smart_wearout_indicator
expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
for: 0m for: 0m
labels: labels:
@@ -180,20 +228,29 @@ groups:
annotations: annotations:
summary: SMART Wearout Indicator (instance {{ $labels.instance }}) summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: smartctl_high_temperature
expr: smartctl_device_temperature > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Disk temperature above 60C on {{ $labels.instance }}"
description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
- name: wireguard_rules - name: wireguard_rules
rules: rules:
- alert: WireguardHandshake - alert: wireguard_handshake_timeout
expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300 expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Wireguard handshake timeout on {{ $labels.instance }}" summary: "Wireguard handshake timeout on {{ $labels.instance }}"
description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes." description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
- name: monitoring_rules - name: monitoring_rules
rules: rules:
- alert: prometheus_not_running - alert: prometheus_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -201,6 +258,7 @@ groups:
description: "Prometheus service not running on {{ $labels.instance }}" description: "Prometheus service not running on {{ $labels.instance }}"
- alert: alertmanager_not_running - alert: alertmanager_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -208,13 +266,7 @@ groups:
description: "Alertmanager service not running on {{ $labels.instance }}" description: "Alertmanager service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running - alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
labels: for: 5m
severity: critical
annotations:
summary: "Pushgateway service not running on {{ $labels.instance }}"
description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -222,6 +274,7 @@ groups:
description: "Pushgateway service not running on {{ $labels.instance }}" description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: loki_not_running - alert: loki_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -229,6 +282,7 @@ groups:
description: "Loki service not running on {{ $labels.instance }}" description: "Loki service not running on {{ $labels.instance }}"
- alert: grafana_not_running - alert: grafana_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@@ -236,6 +290,7 @@ groups:
description: "Grafana service not running on {{ $labels.instance }}" description: "Grafana service not running on {{ $labels.instance }}"
- alert: tempo_not_running - alert: tempo_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@@ -243,8 +298,53 @@ groups:
description: "Tempo service not running on {{ $labels.instance }}" description: "Tempo service not running on {{ $labels.instance }}"
- alert: pyroscope_not_running - alert: pyroscope_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Pyroscope service not running on {{ $labels.instance }}" summary: "Pyroscope service not running on {{ $labels.instance }}"
description: "Pyroscope service not running on {{ $labels.instance }}" description: "Pyroscope service not running on {{ $labels.instance }}"
- name: certificate_rules
rules:
- alert: certificate_expiring_soon
expr: labmon_tlsconmon_certificate_seconds_left < 86400
for: 5m
labels:
severity: warning
annotations:
summary: "TLS certificate expiring soon for {{ $labels.instance }}"
description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
- alert: certificate_check_error
expr: labmon_tlsconmon_certificate_check_error == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Error checking certificate for {{ $labels.address }}"
description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
- alert: step_ca_certificate_expiring
expr: labmon_stepmon_certificate_seconds_left < 3600
for: 5m
labels:
severity: critical
annotations:
summary: "Step-CA certificate expiring for {{ $labels.instance }}"
description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
- name: proxmox_rules
rules:
- alert: pve_node_down
expr: pve_up{id=~"node/.*"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Proxmox node {{ $labels.id }} is down"
description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
- alert: pve_guest_stopped
expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Proxmox VM {{ $labels.id }} is stopped"
description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."

View File

@@ -6,4 +6,10 @@
./proxy.nix ./proxy.nix
./nix.nix ./nix.nix
]; ];
homelab.monitoring.scrapeTargets = [{
job_name = "nix-cache_caddy";
port = 443;
scheme = "https";
}];
} }