diff --git a/CLAUDE.md b/CLAUDE.md
index 7db25b4..74112e2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -122,9 +122,10 @@ This ensures documentation matches the exact nixpkgs version (currently NixOS 25
   - Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix
   - Monitoring: node-exporter and promtail on every host
 - `/modules/` - Custom NixOS modules
-  - `homelab/` - Homelab-specific options (DNS automation, etc.)
+  - `homelab/` - Homelab-specific options (DNS automation, monitoring scrape targets)
 - `/lib/` - Nix library functions
   - `dns-zone.nix` - DNS zone generation functions
+  - `monitoring.nix` - Prometheus scrape target generation functions
 - `/services/` - Reusable service modules, selectively imported by hosts
   - `home-assistant/` - Home automation stack
   - `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
@@ -156,6 +157,7 @@ All hosts automatically get:
 - Internal ACME CA integration (ca.home.2rjus.net)
 - Daily auto-upgrades with auto-reboot
 - Prometheus node-exporter + Promtail (logs to monitoring01)
+- Monitoring scrape target auto-registration via `homelab.monitoring` options
 - Custom root CA trust
 - DNS zone auto-registration via `homelab.dns` options
 
@@ -310,7 +312,7 @@ This means:
 11. Deploy by running `nixos-rebuild boot --flake URL#` on the host.
 12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry
 
-**Note:** DNS A records are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file editing is required.
+**Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required.
 
 ### Important Patterns
 
@@ -333,6 +335,32 @@ All hosts ship metrics and logs to `monitoring01`:
 - **Tracing**: Tempo for distributed tracing
 - **Profiling**: Pyroscope for continuous profiling
 
+**Scrape Target Auto-Generation:**
+
+Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
+
+- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
+- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
+- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
+- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
+
+Host monitoring options (`homelab.monitoring.*`):
+- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets
+- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host (job_name, port, metrics_path, scheme, scrape_interval, honor_labels)
+
+Service modules declare their scrape targets directly (e.g., `services/ca/default.nix` declares step-ca on port 9000). The Prometheus config on monitoring01 auto-generates scrape configs from all hosts.
+
+To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
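+
+As a concrete example, the step-ca declaration from `services/ca/default.nix`:
+
+```nix
+homelab.monitoring.scrapeTargets = [{
+  job_name = "step-ca";
+  port = 9000;
+}];
+```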
+
 ### DNS Architecture
 
 - `ns1` (10.69.13.5) - Primary authoritative DNS + resolver
diff --git a/hosts/http-proxy/wireguard.nix b/hosts/http-proxy/wireguard.nix
index 6485e69..5470996 100644
--- a/hosts/http-proxy/wireguard.nix
+++ b/hosts/http-proxy/wireguard.nix
@@ -26,7 +26,11 @@
       };
     };
   };
-  # monitoring
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "wireguard";
+    port = 9586;
+  }];
+
   services.prometheus.exporters.wireguard = {
     enable = true;
   };
diff --git a/lib/monitoring.nix b/lib/monitoring.nix
new file mode 100644
index 0000000..19e522a
--- /dev/null
+++ b/lib/monitoring.nix
@@ -0,0 +1,145 @@
+{ lib }:
+let
+  # Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
+  extractIP = address:
+    let
+      parts = lib.splitString "/" address;
+    in
+    builtins.head parts;
+
+  # Check if a network interface name looks like a VPN/tunnel interface
+  isVpnInterface = ifaceName:
+    lib.hasPrefix "wg" ifaceName ||
+    lib.hasPrefix "tun" ifaceName ||
+    lib.hasPrefix "tap" ifaceName ||
+    lib.hasPrefix "vti" ifaceName;
+
+  # Extract monitoring info from a single host configuration
+  # Returns null if host should not be included
+  extractHostMonitoring = name: hostConfig:
+    let
+      cfg = hostConfig.config;
+      monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
+      dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
+      hostname = cfg.networking.hostName;
+      networks = cfg.systemd.network.networks or { };
+
+      # Filter out VPN interfaces and find networks with static addresses
+      physicalNetworks = lib.filterAttrs
+        (netName: netCfg:
+          let
+            ifaceName = netCfg.matchConfig.Name or "";
+          in
+          !(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
+        networks;
+
+      # Get addresses from physical networks only
+      networkAddresses = lib.flatten (
+        lib.mapAttrsToList
+          (netName: netCfg: netCfg.address or [ ])
+          physicalNetworks
+      );
+
+      firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
+    in
+    if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
+      null
+    else
+      {
+        inherit hostname;
+        ip = extractIP firstAddress;
+        scrapeTargets = monConfig.scrapeTargets or [ ];
+      };
+
+  # Generate node-exporter targets from all flake hosts
+  generateNodeExporterTargets = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+      flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList;
+    in
+    flakeTargets ++ (externalTargets.nodeExporter or [ ]);
+
+  # Generate scrape configs from all flake hosts and external targets
+  generateScrapeConfigs = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+
+      # Collect all scrapeTargets from all hosts, grouped by job_name
+      allTargets = lib.flatten (map
+        (host:
+          map
+            (target: {
+              inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
+              hostname = host.hostname;
+            })
+            host.scrapeTargets
+        )
+        hostList
+      );
+
+      # Group targets by job_name
+      grouped = lib.groupBy (t: t.job_name) allTargets;
+
+      # Generate a scrape config for each job
+      flakeScrapeConfigs = lib.mapAttrsToList
+        (jobName: targets:
+          let
+            first = builtins.head targets;
+            targetAddrs = map
+              (t:
+                let
+                  portStr = toString t.port;
+                in
+                "${t.hostname}.home.2rjus.net:${portStr}")
+              targets;
+            config = {
+              job_name = jobName;
+              static_configs = [{
+                targets = targetAddrs;
+              }];
+            }
+            // (lib.optionalAttrs (first.metrics_path != "/metrics") {
+              metrics_path = first.metrics_path;
+            })
+            // (lib.optionalAttrs (first.scheme != "http") {
+              scheme = first.scheme;
+            })
+            // (lib.optionalAttrs (first.scrape_interval != null) {
+              scrape_interval = first.scrape_interval;
+            })
+            // (lib.optionalAttrs first.honor_labels {
+              honor_labels = true;
+            });
+          in
+          config
+        )
+        grouped;
+
+      # External scrape configs
+      externalScrapeConfigs = map
+        (ext: {
+          job_name = ext.job_name;
+          static_configs = [{
+            targets = ext.targets;
+          }];
+        } // (lib.optionalAttrs (ext ? metrics_path) {
+          metrics_path = ext.metrics_path;
+        }) // (lib.optionalAttrs (ext ? scheme) {
+          scheme = ext.scheme;
+        }) // (lib.optionalAttrs (ext ? scrape_interval) {
+          scrape_interval = ext.scrape_interval;
+        }))
+        (externalTargets.scrapeConfigs or [ ]);
+    in
+    flakeScrapeConfigs ++ externalScrapeConfigs;
+
+in
+{
+  inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
+}
diff --git a/modules/homelab/default.nix b/modules/homelab/default.nix
index 0d2ba01..b945a3d 100644
--- a/modules/homelab/default.nix
+++ b/modules/homelab/default.nix
@@ -2,5 +2,6 @@
 {
   imports = [
     ./dns.nix
+    ./monitoring.nix
   ];
 }
diff --git a/modules/homelab/monitoring.nix b/modules/homelab/monitoring.nix
new file mode 100644
index 0000000..b6e101d
--- /dev/null
+++ b/modules/homelab/monitoring.nix
@@ -0,0 +1,50 @@
+{ config, lib, ... }:
+let
+  cfg = config.homelab.monitoring;
+in
+{
+  options.homelab.monitoring = {
+    enable = lib.mkOption {
+      type = lib.types.bool;
+      default = true;
+      description = "Include this host in Prometheus node-exporter scrape targets";
+    };
+
+    scrapeTargets = lib.mkOption {
+      type = lib.types.listOf (lib.types.submodule {
+        options = {
+          job_name = lib.mkOption {
+            type = lib.types.str;
+            description = "Prometheus scrape job name";
+          };
+          port = lib.mkOption {
+            type = lib.types.port;
+            description = "Port to scrape metrics from";
+          };
+          metrics_path = lib.mkOption {
+            type = lib.types.str;
+            default = "/metrics";
+            description = "HTTP path to scrape metrics from";
+          };
+          scheme = lib.mkOption {
+            type = lib.types.str;
+            default = "http";
+            description = "HTTP scheme (http or https)";
+          };
+          scrape_interval = lib.mkOption {
+            type = lib.types.nullOr lib.types.str;
+            default = null;
+            description = "Override the global scrape interval for this target";
+          };
+          honor_labels = lib.mkOption {
+            type = lib.types.bool;
+            default = false;
+            description = "Whether to honor labels from the scraped target";
+          };
+        };
+      });
+      default = [ ];
+      description = "Additional Prometheus scrape targets exposed by this host";
+    };
+  };
+}
diff --git a/services/ca/default.nix b/services/ca/default.nix
index 9c52015..b5759a0 100644
--- a/services/ca/default.nix
+++ b/services/ca/default.nix
@@ -1,5 +1,9 @@
 { pkgs, unstable, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "step-ca";
+    port = 9000;
+  }];
   sops.secrets."ca_root_pw" = {
     sopsFile = ../../secrets/ca/secrets.yaml;
     owner = "step-ca";
diff --git a/services/home-assistant/default.nix b/services/home-assistant/default.nix
index c987fe4..14f4fce 100644
--- a/services/home-assistant/default.nix
+++ b/services/home-assistant/default.nix
@@ -1,5 +1,11 @@
 { pkgs, config, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "home-assistant";
+    port = 8123;
+    metrics_path = "/api/prometheus";
+    scrape_interval = "60s";
+  }];
   # Enable the Home Assistant service
   services.home-assistant = {
     enable = true;
diff --git a/services/http-proxy/default.nix b/services/http-proxy/default.nix
index d046d09..07ef28f 100644
--- a/services/http-proxy/default.nix
+++ b/services/http-proxy/default.nix
@@ -3,4 +3,9 @@
   imports = [
     ./proxy.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "caddy";
+    port = 80;
+  }];
 }
diff --git a/services/jellyfin/default.nix b/services/jellyfin/default.nix
index e2322de..729080c 100644
--- a/services/jellyfin/default.nix
+++ b/services/jellyfin/default.nix
@@ -1,5 +1,9 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "jellyfin";
+    port = 8096;
+  }];
   services.jellyfin = {
     enable = true;
   };
diff --git a/services/monitoring/external-targets.nix b/services/monitoring/external-targets.nix
new file mode 100644
index 0000000..debc0d5
--- /dev/null
+++ b/services/monitoring/external-targets.nix
@@ -0,0 +1,12 @@
+# Monitoring targets for hosts not managed by this flake
+# These are manually maintained and combined with auto-generated targets
+{
+  nodeExporter = [
+    "gunter.home.2rjus.net:9100"
+  ];
+  scrapeConfigs = [
+    { job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
+    { job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
+    { job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
+  ];
+}
diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix
index cb496ad..c96b817 100644
--- a/services/monitoring/prometheus.nix
+++ b/services/monitoring/prometheus.nix
@@ -1,4 +1,11 @@
-{ ... }:
+{ self, lib, ... }:
+let
+  monLib = import ../../lib/monitoring.nix { inherit lib; };
+  externalTargets = import ./external-targets.nix;
+
+  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
+  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
+in
 {
   services.prometheus = {
     enable = true;
@@ -45,26 +52,16 @@
     ];
 
     scrapeConfigs = [
+      # Auto-generated node-exporter targets from flake hosts + external
      {
        job_name = "node-exporter";
        static_configs = [
          {
-            targets = [
-              "ca.home.2rjus.net:9100"
-              "gunter.home.2rjus.net:9100"
-              "ha1.home.2rjus.net:9100"
-              "http-proxy.home.2rjus.net:9100"
-              "jelly01.home.2rjus.net:9100"
-              "monitoring01.home.2rjus.net:9100"
-              "nix-cache01.home.2rjus.net:9100"
-              "ns1.home.2rjus.net:9100"
-              "ns2.home.2rjus.net:9100"
-              "pgdb1.home.2rjus.net:9100"
-              "nats1.home.2rjus.net:9100"
-            ];
+            targets = nodeExporterTargets;
          }
        ];
      }
+      # Local monitoring services (not auto-generated)
      {
        job_name = "prometheus";
        static_configs = [
@@ -85,7 +82,7 @@
        job_name = "grafana";
        static_configs = [
          {
-            targets = [ "localhost:3100" ];
+            targets = [ "localhost:3000" ];
          }
        ];
      }
@@ -98,13 +95,23 @@
        ];
      }
      {
-        job_name = "restic_rest";
+        job_name = "pushgateway";
+        honor_labels = true;
        static_configs = [
          {
-            targets = [ "10.69.12.52:8000" ];
+            targets = [ "localhost:9091" ];
          }
        ];
      }
+      {
+        job_name = "labmon";
+        static_configs = [
+          {
+            targets = [ "monitoring01.home.2rjus.net:9969" ];
+          }
+        ];
+      }
+      # pve-exporter with complex relabel config
      {
        job_name = "pve-exporter";
        static_configs = [
@@ -133,91 +140,8 @@
          }
        ];
      }
-      {
-        job_name = "caddy";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net" ];
-          }
-        ];
-      }
-      {
-        job_name = "jellyfin";
-        static_configs = [
-          {
-            targets = [ "jelly01.home.2rjus.net:8096" ];
-          }
-        ];
-      }
-      {
-        job_name = "smartctl";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:9633" ];
-          }
-        ];
-      }
-      {
-        job_name = "wireguard";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net:9586" ];
-          }
-        ];
-      }
-      {
-        job_name = "home-assistant";
-        scrape_interval = "60s";
-        metrics_path = "/api/prometheus";
-        static_configs = [
-          {
-            targets = [ "ha1.home.2rjus.net:8123" ];
-          }
-        ];
-      }
-      {
-        job_name = "ghettoptt";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:8989" ];
-          }
-        ];
-      }
-      {
-        job_name = "step-ca";
-        static_configs = [
-          {
-            targets = [ "ca.home.2rjus.net:9000" ];
-          }
-        ];
-      }
-      {
-        job_name = "labmon";
-        static_configs = [
-          {
-            targets = [ "monitoring01.home.2rjus.net:9969" ];
-          }
-        ];
-      }
-      {
-        job_name = "pushgateway";
-        honor_labels = true;
-        static_configs = [
-          {
-            targets = [ "localhost:9091" ];
-          }
-        ];
-      }
-      {
-        job_name = "nix-cache_caddy";
-        scheme = "https";
-        static_configs = [
-          {
-            targets = [ "nix-cache.home.2rjus.net" ];
-          }
-        ];
-      }
-    ];
+    ] ++ autoScrapeConfigs;
+
    pushgateway = {
      enable = true;
      web = {
diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index 4c9afc0..c5eba34 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -57,6 +57,38 @@ groups:
       annotations:
         summary: "Promtail service not running on {{ $labels.instance }}"
         description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
+      - alert: filesystem_filling_up
+        expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
+          description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
+      - alert: systemd_not_running
+        expr: node_systemd_system_running == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Systemd not in running state on {{ $labels.instance }}"
+          description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
+      - alert: high_file_descriptors
+        expr: node_filefd_allocated / node_filefd_maximum > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High file descriptor usage on {{ $labels.instance }}"
+          description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
+      - alert: host_reboot
+        expr: changes(node_boot_time_seconds[10m]) > 0
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: "Host {{ $labels.instance }} has rebooted"
+          description: "Host {{ $labels.instance }} has rebooted."
   - name: nameserver_rules
     rules:
       - alert: unbound_down
@@ -75,7 +107,7 @@
       annotations:
         summary: "NSD not running on {{ $labels.instance }}"
         description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
-  - name: http-proxy_rules
+  - name: http_proxy_rules
     rules:
       - alert: caddy_down
         expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@
       annotations:
         summary: "Caddy not running on {{ $labels.instance }}"
         description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: caddy_upstream_unhealthy
+        expr: caddy_reverse_proxy_upstreams_healthy == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
+          description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
+      - alert: caddy_high_error_rate
+        expr: rate(caddy_http_request_errors_total[5m]) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High HTTP error rate on {{ $labels.instance }}"
+          description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
   - name: nats_rules
     rules:
       - alert: nats_down
@@ -97,7 +145,7 @@
         description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
   - name: nix_cache_rules
     rules:
-      - alert: build-flakes_service_not_active_recently
+      - alert: build_flakes_service_not_active_recently
         expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
         for: 0m
         labels:
@@ -138,7 +186,7 @@
       annotations:
         summary: "Home assistant not running on {{ $labels.instance }}"
         description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
-      - alert: zigbee2qmtt_down
+      - alert: zigbee2mqtt_down
         expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
         for: 5m
         labels:
@@ -156,7 +204,7 @@
         description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
   - name: smartctl_rules
     rules:
-      - alert: SmartCriticalWarning
+      - alert: smart_critical_warning
         expr: smartctl_device_critical_warning > 0
         for: 0m
         labels:
@@ -164,7 +212,7 @@
       annotations:
         summary: SMART critical warning (instance {{ $labels.instance }})
         description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: SmartMediaErrors
+      - alert: smart_media_errors
         expr: smartctl_device_media_errors > 0
         for: 0m
         labels:
@@ -172,7 +220,7 @@
       annotations:
         summary: SMART media errors (instance {{ $labels.instance }})
         description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: SmartWearoutIndicator
+      - alert: smart_wearout_indicator
         expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
         for: 0m
         labels:
@@ -180,20 +228,29 @@
       annotations:
         summary: SMART Wearout Indicator (instance {{ $labels.instance }})
         description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      - alert: smartctl_high_temperature
+        expr: smartctl_device_temperature > 60
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk temperature above 60C on {{ $labels.instance }}"
+          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
   - name: wireguard_rules
     rules:
-      - alert: WireguardHandshake
-        expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
+      - alert: wireguard_handshake_timeout
+        expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
         for: 1m
         labels:
           severity: warning
         annotations:
           summary: "Wireguard handshake timeout on {{ $labels.instance }}"
-          description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
+          description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
   - name: monitoring_rules
     rules:
       - alert: prometheus_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -201,6 +258,7 @@
         description: "Prometheus service not running on {{ $labels.instance }}"
       - alert: alertmanager_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -208,13 +266,7 @@
         description: "Alertmanager service not running on {{ $labels.instance }}"
       - alert: pushgateway_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
-        labels:
-          severity: critical
-        annotations:
-          summary: "Pushgateway service not running on {{ $labels.instance }}"
-          description: "Pushgateway service not running on {{ $labels.instance }}"
-      - alert: pushgateway_not_running
-        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -222,6 +274,7 @@
         description: "Pushgateway service not running on {{ $labels.instance }}"
       - alert: loki_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
+        for: 5m
         labels:
           severity: critical
         annotations:
@@ -229,6 +282,7 @@
         description: "Loki service not running on {{ $labels.instance }}"
       - alert: grafana_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
@@ -236,6 +290,7 @@
         description: "Grafana service not running on {{ $labels.instance }}"
       - alert: tempo_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
@@ -243,8 +298,53 @@
         description: "Tempo service not running on {{ $labels.instance }}"
       - alert: pyroscope_not_running
         expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
+        for: 5m
         labels:
           severity: warning
         annotations:
           summary: "Pyroscope service not running on {{ $labels.instance }}"
           description: "Pyroscope service not running on {{ $labels.instance }}"
+  - name: certificate_rules
+    rules:
+      - alert: certificate_expiring_soon
+        expr: labmon_tlsconmon_certificate_seconds_left < 86400
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "TLS certificate expiring soon for {{ $labels.instance }}"
+          description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
+      - alert: certificate_check_error
+        expr: labmon_tlsconmon_certificate_check_error == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Error checking certificate for {{ $labels.address }}"
+          description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
+      - alert: step_ca_certificate_expiring
+        expr: labmon_stepmon_certificate_seconds_left < 3600
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Step-CA certificate expiring for {{ $labels.instance }}"
+          description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
+  - name: proxmox_rules
+    rules:
+      - alert: pve_node_down
+        expr: pve_up{id=~"node/.*"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Proxmox node {{ $labels.id }} is down"
+          description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
+      - alert: pve_guest_stopped
+        expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Proxmox VM {{ $labels.id }} is stopped"
+          description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
diff --git a/services/nix-cache/default.nix b/services/nix-cache/default.nix
index 18db40c..5db16b7 100644
--- a/services/nix-cache/default.nix
+++ b/services/nix-cache/default.nix
@@ -6,4 +6,10 @@
     ./proxy.nix
     ./nix.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "nix-cache_caddy";
+    port = 443;
+    scheme = "https";
+  }];
 }