monitoring: propagate host labels to Prometheus scrape targets
Extract homelab.host metadata (tier, priority, role, labels) from host configurations and propagate them to Prometheus scrape targets. This enables semantic alert filtering using labels instead of hardcoded instance names.

Changes:
- lib/monitoring.nix: Extract host metadata, group targets by labels
- prometheus.nix: Use structured static_configs with labels
- rules.yml: Replace instance filters with role-based filters

Example labels in Prometheus:
- ns1/ns2: role=dns, dns_role=primary/secondary
- nix-cache01: role=build-host
- testvm*: tier=test

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -17,8 +17,9 @@ groups:
|
||||
annotations:
|
||||
summary: "Disk space low on {{ $labels.instance }}"
|
||||
description: "Disk space is low on {{ $labels.instance }}. Please check."
|
||||
# Build hosts (e.g., nix-cache01) are expected to have high CPU during builds
|
||||
- alert: high_cpu_load
|
||||
expr: max(node_load5{instance!="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance!="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
|
||||
expr: max(node_load5{role!="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role!="build-host", mode="idle"}) * 0.7)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -26,7 +27,7 @@ groups:
|
||||
summary: "High CPU load on {{ $labels.instance }}"
|
||||
description: "CPU load is high on {{ $labels.instance }}. Please check."
|
||||
- alert: high_cpu_load
|
||||
expr: max(node_load5{instance="nix-cache01.home.2rjus.net:9100"}) by (instance) > (count by (instance)(node_cpu_seconds_total{instance="nix-cache01.home.2rjus.net:9100", mode="idle"}) * 0.7)
|
||||
expr: max(node_load5{role="build-host"}) by (instance) > (count by (instance)(node_cpu_seconds_total{role="build-host", mode="idle"}) * 0.7)
|
||||
for: 2h
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -115,8 +116,9 @@ groups:
|
||||
annotations:
|
||||
summary: "NSD not running on {{ $labels.instance }}"
|
||||
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
# Only alert on primary DNS (secondary has cold cache after failover)
|
||||
- alert: unbound_low_cache_hit_ratio
|
||||
expr: (rate(unbound_cache_hits_total[5m]) / (rate(unbound_cache_hits_total[5m]) + rate(unbound_cache_misses_total[5m]))) < 0.5
|
||||
expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
Reference in New Issue
Block a user