diff --git a/docs/plans/prometheus-scrape-target-labels.md b/docs/plans/prometheus-scrape-target-labels.md index 0255347..d1b2508 100644 --- a/docs/plans/prometheus-scrape-target-labels.md +++ b/docs/plans/prometheus-scrape-target-labels.md @@ -5,20 +5,19 @@ | Step | Status | Notes | |------|--------|-------| | 1. Create `homelab.host` module | ✅ Complete | `modules/homelab/host.nix` | -| 2. Update `lib/monitoring.nix` | ❌ Not started | Labels not extracted or propagated | -| 3. Update Prometheus config | ❌ Not started | Still uses flat target list | -| 4. Set metadata on hosts | ⚠️ Partial | Some hosts configured, see below | -| 5. Update alert rules | ❌ Not started | | -| 6. Labels for service targets | ❌ Not started | Optional | +| 2. Update `lib/monitoring.nix` | ✅ Complete | Labels extracted and propagated | +| 3. Update Prometheus config | ✅ Complete | Uses structured static_configs | +| 4. Set metadata on hosts | ✅ Complete | All relevant hosts configured | +| 5. Update alert rules | ✅ Complete | Role-based filtering implemented | +| 6. Labels for service targets | ✅ Complete | Host labels propagated to all services | **Hosts with metadata configured:** - `ns1`, `ns2`: `role = "dns"`, `labels.dns_role = "primary"/"secondary"` -- `nix-cache01`: `role = "build-host"` (missing `priority = "low"` from plan) +- `nix-cache01`: `role = "build-host"` - `vault01`: `role = "vault"` -- `jump`: `role = "bastion"` -- `template`, `template2`, `testvm*`: `tier` and `priority` set +- `testvm01/02/03`: `tier = "test"` -**Key gap:** The `homelab.host` module exists and some hosts use it, but `lib/monitoring.nix` does not extract these values—they are not propagated to Prometheus scrape targets. +**Implementation complete.** Branch: `prometheus-scrape-target-labels` --- @@ -119,7 +118,7 @@ Import this module in `modules/homelab/default.nix`. ### 2. Update `lib/monitoring.nix` -❌ **Not started.** The current implementation does not extract `homelab.host` values. 
+✅ **Complete.** Labels are now extracted and propagated. - `extractHostMonitoring` should also extract `homelab.host` values (priority, role, labels). - Build the combined label set from `homelab.host`: @@ -149,7 +148,7 @@ This requires grouping hosts by their label attrset and producing one `static_co ### 3. Update `services/monitoring/prometheus.nix` -❌ **Not started.** Still uses flat target list (`static_configs = [{ targets = nodeExporterTargets; }]`). +✅ **Complete.** Now uses structured static_configs output. Change the node-exporter scrape config to use the new structured output: @@ -163,7 +162,7 @@ static_configs = nodeExporterTargets; ### 4. Set metadata on hosts -⚠️ **Partial.** Some hosts configured (see status table above). Current `nix-cache01` only has `role`, missing the `priority = "low"` suggested below. +✅ **Complete.** All relevant hosts have metadata configured. Note: The implementation filters by `role` rather than `priority`, which matches the existing nix-cache01 configuration. Example in `hosts/nix-cache01/configuration.nix`: @@ -189,17 +188,11 @@ homelab.host = { ### 5. Update alert rules -❌ **Not started.** Requires steps 2-3 to be completed first. +✅ **Complete.** Updated `services/monitoring/rules.yml`: -After implementing labels, review and update `services/monitoring/rules.yml`: +- `high_cpu_load`: Replaced `instance!="nix-cache01..."` with `role!="build-host"` for standard hosts (15m duration) and `role="build-host"` for build hosts (2h duration). +- `unbound_low_cache_hit_ratio`: Added `dns_role="primary"` filter to only alert on the primary DNS resolver (secondary has a cold cache). -- Replace instance-name exclusions with label-based filters (e.g. `{priority!="low"}` instead of `{instance!="nix-cache01.home.2rjus.net:9100"}`). -- Consider whether any other rules should differentiate by priority or role. +### 6. 
Labels for `generateScrapeConfigs` (service targets) -Specifically, the `high_cpu_load` rule currently has a nix-cache01 exclusion that should be replaced with a `priority`-based filter. - -### 6. Consider labels for `generateScrapeConfigs` (service targets) - -❌ **Not started.** Optional enhancement. - -The same label propagation could be applied to service-level scrape targets. This is optional and can be deferred -- service targets are more specialized and less likely to need generic label-based filtering. +✅ **Complete.** Host labels are now propagated to all auto-generated service scrape targets (unbound, homelab-deploy, nixos-exporter, etc.). This enables semantic filtering on any service metric, such as using `dns_role="primary"` with the unbound job. diff --git a/lib/monitoring.nix b/lib/monitoring.nix index 19e522a..dbb62b8 100644 --- a/lib/monitoring.nix +++ b/lib/monitoring.nix @@ -21,6 +21,7 @@ let cfg = hostConfig.config; monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; }; dnsConfig = (cfg.homelab or { }).dns or { enable = true; }; + hostConfig' = (cfg.homelab or { }).host or { }; hostname = cfg.networking.hostName; networks = cfg.systemd.network.networks or { }; @@ -49,20 +50,64 @@ let inherit hostname; ip = extractIP firstAddress; scrapeTargets = monConfig.scrapeTargets or [ ]; + # Host metadata for label propagation + tier = hostConfig'.tier or "prod"; + priority = hostConfig'.priority or "high"; + role = hostConfig'.role or null; + labels = hostConfig'.labels or { }; }; + # Build effective labels for a host (only include non-default values) + buildEffectiveLabels = host: + (lib.optionalAttrs (host.tier != "prod") { tier = host.tier; }) + // (lib.optionalAttrs (host.priority != "high") { priority = host.priority; }) + // (lib.optionalAttrs (host.role != null) { role = host.role; }) + // host.labels; + # Generate node-exporter targets from all flake hosts + # Returns a list of static_configs entries with labels 
generateNodeExporterTargets = self: externalTargets: let nixosConfigs = self.nixosConfigurations or { }; hostList = lib.filter (x: x != null) ( lib.mapAttrsToList extractHostMonitoring nixosConfigs ); - flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList; + + # Build target entries with labels for each host + flakeEntries = map + (host: { + target = "${host.hostname}.home.2rjus.net:9100"; + labels = buildEffectiveLabels host; + }) + hostList; + + # External targets have no labels + externalEntries = map + (target: { inherit target; labels = { }; }) + (externalTargets.nodeExporter or [ ]); + + allEntries = flakeEntries ++ externalEntries; + + # Group entries by their label set for efficient static_configs + # Convert labels attrset to a string key for grouping + labelKey = entry: builtins.toJSON entry.labels; + grouped = lib.groupBy labelKey allEntries; + + # Convert groups to static_configs format + staticConfigs = lib.mapAttrsToList + (key: entries: + let + labels = (builtins.head entries).labels; + in + { targets = map (e: e.target) entries; } + // (lib.optionalAttrs (labels != { }) { inherit labels; }) + ) + grouped; in - flakeTargets ++ (externalTargets.nodeExporter or [ ]); + staticConfigs; # Generate scrape configs from all flake hosts and external targets + # Host labels are propagated to service targets for semantic alert filtering generateScrapeConfigs = self: externalTargets: let nixosConfigs = self.nixosConfigurations or { }; @@ -70,13 +115,14 @@ let lib.mapAttrsToList extractHostMonitoring nixosConfigs ); - # Collect all scrapeTargets from all hosts, grouped by job_name + # Collect all scrapeTargets from all hosts, including host labels allTargets = lib.flatten (map (host: map (target: { inherit (target) job_name port metrics_path scheme scrape_interval honor_labels; hostname = host.hostname; + hostLabels = buildEffectiveLabels host; }) host.scrapeTargets ) @@ -87,22 +133,32 @@ let grouped = lib.groupBy (t: t.job_name) allTargets; 
# Generate a scrape config for each job + # Within each job, group targets by their host labels for efficient static_configs flakeScrapeConfigs = lib.mapAttrsToList (jobName: targets: let first = builtins.head targets; - targetAddrs = map - (t: + + # Group targets within this job by their host labels + labelKey = t: builtins.toJSON t.hostLabels; + groupedByLabels = lib.groupBy labelKey targets; + + staticConfigs = lib.mapAttrsToList + (key: labelTargets: let - portStr = toString t.port; + labels = (builtins.head labelTargets).hostLabels; + targetAddrs = map + (t: "${t.hostname}.home.2rjus.net:${toString t.port}") + labelTargets; in - "${t.hostname}.home.2rjus.net:${portStr}") - targets; + { targets = targetAddrs; } + // (lib.optionalAttrs (labels != { }) { inherit labels; }) + ) + groupedByLabels; + config = { job_name = jobName; - static_configs = [{ - targets = targetAddrs; - }]; + static_configs = staticConfigs; } // (lib.optionalAttrs (first.metrics_path != "/metrics") { metrics_path = first.metrics_path; diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix index 57bc86d..c37bd32 100644 --- a/services/monitoring/prometheus.nix +++ b/services/monitoring/prometheus.nix @@ -121,22 +121,20 @@ in scrapeConfigs = [ # Auto-generated node-exporter targets from flake hosts + external + # Each static_config entry may have labels from homelab.host metadata { job_name = "node-exporter"; - static_configs = [ - { - targets = nodeExporterTargets; - } - ]; + static_configs = nodeExporterTargets; } # Systemd exporter on all hosts (same targets, different port) + # Preserves the same label grouping as node-exporter { job_name = "systemd-exporter"; - static_configs = [ - { - targets = map (t: builtins.replaceStrings [":9100"] [":9558"] t) nodeExporterTargets; - } - ]; + static_configs = map + (cfg: cfg // { + targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets; + }) + nodeExporterTargets; } # Local monitoring services 
(not auto-generated) { diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index 9e612eb..88c5e6c 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -17,8 +17,9 @@ groups: annotations: summary: "Disk space low on {{ $labels.instance }}" description: "Disk space is low on {{ $labels.instance }}. Please check." + # Build hosts (e.g., nix-cache01) are expected to have high CPU during builds - alert: high_cpu_load - expr: max(node_load5{instance!="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance!="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7) + expr: max(node_load5{role!="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role!="build-host", mode="idle"}) * 0.7) for: 15m labels: severity: warning @@ -26,7 +27,7 @@ groups: summary: "High CPU load on {{ $labels.instance }}" description: "CPU load is high on {{ $labels.instance }}. Please check." - alert: high_cpu_load - expr: max(node_load5{instance="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7) + expr: max(node_load5{role="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role="build-host", mode="idle"}) * 0.7) for: 2h labels: severity: warning @@ -115,8 +116,9 @@ groups: annotations: summary: "NSD not running on {{ $labels.instance }}" description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." 
+ # Only alert on the primary DNS resolver (the secondary has a cold cache after failover) - alert: unbound_low_cache_hit_ratio - expr: (rate(unbound_cache_hits_total[5m]) / (rate(unbound_cache_hits_total[5m]) + rate(unbound_cache_misses_total[5m]))) < 0.5 + expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.5 for: 15m labels: severity: warning