# NixOS module configuring Prometheus, Alertmanager and Pushgateway, plus a
# systemd service/timer pair that keeps a bearer token for scraping OpenBao
# metrics available at /run/secrets/prometheus/openbao-token.
{ self, lib, pkgs, ... }:
let
  monLib = import ../../lib/monitoring.nix { inherit lib; };
  externalTargets = import ./external-targets.nix;

  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;

  # Script to fetch AppRole token for Prometheus to use when scraping OpenBao metrics
  fetchOpenbaoToken = pkgs.writeShellApplication {
    name = "fetch-openbao-token";
    runtimeInputs = [ pkgs.curl pkgs.jq ];
    text = ''
      VAULT_ADDR="https://vault01.home.2rjus.net:8200"
      APPROLE_DIR="/var/lib/vault/approle"
      OUTPUT_FILE="/run/secrets/prometheus/openbao-token"

      # Read AppRole credentials
      if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
        echo "AppRole credentials not found at $APPROLE_DIR" >&2
        exit 1
      fi

      ROLE_ID=$(cat "$APPROLE_DIR/role-id")
      SECRET_ID=$(cat "$APPROLE_DIR/secret-id")

      # Authenticate to Vault
      # -S makes curl print the error when -f/-s would otherwise fail silently.
      # NOTE(review): -k disables TLS certificate verification while the
      # AppRole secret is being sent; drop it if the host trusts the issuing CA.
      AUTH_RESPONSE=$(curl -sSf -k -X POST \
        -d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \
        "$VAULT_ADDR/v1/auth/approle/login")

      # Extract token
      VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token')
      if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
        echo "Failed to extract Vault token from response" >&2
        exit 1
      fi

      # Write token to file
      mkdir -p "$(dirname "$OUTPUT_FILE")"
      # Tighten the umask only after the directory exists, so a newly created
      # token file is never readable by other users before chown/chmod run,
      # while the directory keeps its default mode.
      umask 077
      echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE"
      chown prometheus:prometheus "$OUTPUT_FILE"
      chmod 0400 "$OUTPUT_FILE"

      echo "Successfully fetched OpenBao token"
    '';
  };
in
{
  # Systemd service to fetch AppRole token for Prometheus OpenBao scraping
  # The token is used to authenticate when scraping /v1/sys/metrics
  systemd.services.prometheus-openbao-token = {
    description = "Fetch OpenBao token for Prometheus metrics scraping";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    before = [ "prometheus.service" ];
    requiredBy = [ "prometheus.service" ];

    serviceConfig = {
      Type = "oneshot";
      # writeShellApplication produces a package directory; ExecStart needs the
      # path of the executable inside it (bin/fetch-openbao-token).
      ExecStart = lib.getExe fetchOpenbaoToken;
      # Deliberately no RemainAfterExit: a unit that stays active after exit is
      # not started again when the refresh timer elapses, so the token would
      # expire without ever being renewed.
    };
  };

  # Timer to periodically refresh the token (AppRole tokens have 1-hour TTL)
  systemd.timers.prometheus-openbao-token = {
    description = "Refresh OpenBao token for Prometheus";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnBootSec = "5min";
      OnUnitActiveSec = "30min";
      RandomizedDelaySec = "5min";
    };
  };

  services.prometheus = {
    enable = true;
    # syntax-only check because we use external credential files (e.g., openbao-token)
    checkConfig = "syntax-only";

    alertmanager = {
      enable = true;
      configuration = {
        global = { };
        route = {
          receiver = "webhook_natstonotify";
          group_wait = "30s";
          group_interval = "5m";
          repeat_interval = "1h";
          group_by = [ "alertname" ];
        };
        receivers = [
          {
            name = "webhook_natstonotify";
            webhook_configs = [
              {
                url = "http://localhost:5001/alert";
              }
            ];
          }
        ];
      };
    };

    # Alertmanager instance that Prometheus sends alerts to
    alertmanagers = [
      {
        static_configs = [
          {
            targets = [ "localhost:9093" ];
          }
        ];
      }
    ];

    retentionTime = "30d";

    globalConfig = {
      scrape_interval = "15s";
    };

    rules = [
      (builtins.readFile ./rules.yml)
    ];

    scrapeConfigs = [
      # Auto-generated node-exporter targets from flake hosts + external
      {
        job_name = "node-exporter";
        static_configs = [
          {
            targets = nodeExporterTargets;
          }
        ];
      }
      # Systemd exporter on all hosts (same targets, different port)
      {
        job_name = "systemd-exporter";
        static_configs = [
          {
            targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) nodeExporterTargets;
          }
        ];
      }
      # Local monitoring services (not auto-generated)
      {
        job_name = "prometheus";
        static_configs = [
          {
            targets = [ "localhost:9090" ];
          }
        ];
      }
      {
        job_name = "loki";
        static_configs = [
          {
            targets = [ "localhost:3100" ];
          }
        ];
      }
      {
        job_name = "grafana";
        static_configs = [
          {
            targets = [ "localhost:3000" ];
          }
        ];
      }
      {
        job_name = "alertmanager";
        static_configs = [
          {
            targets = [ "localhost:9093" ];
          }
        ];
      }
      {
        job_name = "pushgateway";
        honor_labels = true;
        static_configs = [
          {
            targets = [ "localhost:9091" ];
          }
        ];
      }
      {
        job_name = "labmon";
        static_configs = [
          {
            targets = [ "monitoring01.home.2rjus.net:9969" ];
          }
        ];
      }
      # TODO: nix-cache_caddy can't be auto-generated because the cert is issued
      # for nix-cache.home.2rjus.net (service CNAME), not nix-cache01 (hostname).
      # Consider adding a target override to homelab.monitoring.scrapeTargets.
      {
        job_name = "nix-cache_caddy";
        scheme = "https";
        static_configs = [
          {
            targets = [ "nix-cache.home.2rjus.net" ];
          }
        ];
      }
      # pve-exporter with complex relabel config
      {
        job_name = "pve-exporter";
        static_configs = [
          {
            targets = [ "10.69.12.75" ];
          }
        ];
        metrics_path = "/pve";
        params = {
          module = [ "default" ];
          cluster = [ "1" ];
          node = [ "1" ];
        };
        relabel_configs = [
          # Pass the configured target address to the exporter as ?target=...
          {
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          # Keep that address as the instance label
          {
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          # Send the actual scrape request to 127.0.0.1:9221
          {
            target_label = "__address__";
            replacement = "127.0.0.1:9221";
          }
        ];
      }
      # OpenBao metrics with bearer token auth
      {
        job_name = "openbao";
        scheme = "https";
        metrics_path = "/v1/sys/metrics";
        params = {
          format = [ "prometheus" ];
        };
        static_configs = [
          {
            targets = [ "vault01.home.2rjus.net:8200" ];
          }
        ];
        authorization = {
          type = "Bearer";
          # Written by the prometheus-openbao-token service
          credentials_file = "/run/secrets/prometheus/openbao-token";
        };
      }
    ] ++ autoScrapeConfigs;

    pushgateway = {
      enable = true;
      web = {
        external-url = "https://pushgw.home.2rjus.net";
      };
    };
  };
}