# NixOS module: VictoriaMetrics-based monitoring stack (metrics scraping,
# vmalert, Alertmanager, Caddy reverse proxies) for this host.
{ self, config, lib, pkgs, ... }:
let
  monLib = import ../../lib/monitoring.nix { inherit lib; };
  externalTargets = import ../monitoring/external-targets.nix;
  # Targets/scrape configs auto-generated from the flake's hosts plus the
  # statically declared external targets.
  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;

  # Script to fetch AppRole token for VictoriaMetrics to use when scraping
  # OpenBao metrics. Run as root by the oneshot service below (it needs to
  # read the AppRole credentials and chown the resulting token file).
  fetchOpenbaoToken = pkgs.writeShellApplication {
    name = "fetch-openbao-token-vm";
    runtimeInputs = [ pkgs.curl pkgs.jq ];
    text = ''
      VAULT_ADDR="https://vault01.home.2rjus.net:8200"
      APPROLE_DIR="/var/lib/vault/approle"
      OUTPUT_FILE="/run/secrets/victoriametrics/openbao-token"

      # Restrict every file/dir we create so the token is never readable by
      # other users — previously the token was written with the default umask
      # and only chmod'ed to 0400 afterwards, leaving a brief exposure window.
      umask 077

      # Read AppRole credentials
      if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
        echo "AppRole credentials not found at $APPROLE_DIR" >&2
        exit 1
      fi

      ROLE_ID=$(cat "$APPROLE_DIR/role-id")
      SECRET_ID=$(cat "$APPROLE_DIR/secret-id")

      # Authenticate to Vault.
      # NOTE(review): -k disables TLS certificate verification for the vault
      # host — consider trusting the internal CA instead (--cacert).
      AUTH_RESPONSE=$(curl -sf -k -X POST \
        -d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \
        "$VAULT_ADDR/v1/auth/approle/login")

      # Extract token
      VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token')
      if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
        echo "Failed to extract Vault token from response" >&2
        exit 1
      fi

      # Write token to file (created 0600 thanks to umask, then tightened)
      mkdir -p "$(dirname "$OUTPUT_FILE")"
      echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE"
      chown victoriametrics:victoriametrics "$OUTPUT_FILE"
      chmod 0400 "$OUTPUT_FILE"
      echo "Successfully fetched OpenBao token"
    '';
  };

  scrapeConfigs = [
    # Auto-generated node-exporter targets from flake hosts + external
    {
      job_name = "node-exporter";
      static_configs = nodeExporterTargets;
    }
    # Systemd exporter on all hosts (same targets, different port).
    # Assumes every node-exporter target carries an explicit ":9100" suffix.
    {
      job_name = "systemd-exporter";
      static_configs = map
        (cfg: cfg // {
          targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets;
        })
        nodeExporterTargets;
    }
    # Local monitoring services
    {
      job_name = "victoriametrics";
      static_configs = [{ targets = [ "localhost:8428" ]; }];
    }
    {
      job_name = "loki";
      static_configs = [{ targets = [ "localhost:3100" ]; }];
    }
    {
      job_name = "grafana";
      static_configs = [{ targets = [ "localhost:3000" ]; }];
    }
    {
      job_name = "alertmanager";
      static_configs = [{ targets = [ "localhost:9093" ]; }];
    }
    # Caddy metrics from nix-cache02
    {
      job_name = "nix-cache_caddy";
      scheme = "https";
      static_configs = [{ targets = [ "nix-cache.home.2rjus.net" ]; }];
    }
    # OpenBao metrics with bearer token auth; the credential file is
    # maintained by the victoriametrics-openbao-token service/timer below.
    {
      job_name = "openbao";
      scheme = "https";
      metrics_path = "/v1/sys/metrics";
      params = { format = [ "prometheus" ]; };
      static_configs = [{ targets = [ "vault01.home.2rjus.net:8200" ]; }];
      authorization = {
        type = "Bearer";
        credentials_file = "/run/secrets/victoriametrics/openbao-token";
      };
    }
    # Apiary external service
    {
      job_name = "apiary";
      scheme = "https";
      scrape_interval = "60s";
      static_configs = [{ targets = [ "apiary.t-juice.club" ]; }];
      authorization = {
        type = "Bearer";
        credentials_file = "/run/secrets/victoriametrics-apiary-token";
      };
    }
  ] ++ autoScrapeConfigs;
in
{
  # Static user for VictoriaMetrics (overrides DynamicUser) so vault.secrets
  # and credential files can be owned by this user
  users.users.victoriametrics = {
    isSystemUser = true;
    group = "victoriametrics";
  };
  users.groups.victoriametrics = { };

  # Override DynamicUser since we need a static user for credential file access
  systemd.services.victoriametrics.serviceConfig = {
    DynamicUser = lib.mkForce false;
    User = "victoriametrics";
    Group = "victoriametrics";
  };

  # Systemd service to fetch AppRole token for OpenBao scraping.
  # requiredBy/before ensures a fresh token exists before VictoriaMetrics
  # starts scraping.
  systemd.services.victoriametrics-openbao-token = {
    description = "Fetch OpenBao token for VictoriaMetrics metrics scraping";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    before = [ "victoriametrics.service" ];
    requiredBy = [ "victoriametrics.service" ];
    serviceConfig = {
      Type = "oneshot";
      ExecStart = lib.getExe fetchOpenbaoToken;
    };
  };

  # Timer to periodically refresh the token (AppRole tokens have 1-hour TTL);
  # 30-minute refresh keeps the token valid with margin to spare.
  systemd.timers.victoriametrics-openbao-token = {
    description = "Refresh OpenBao token for VictoriaMetrics";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnBootSec = "5min";
      OnUnitActiveSec = "30min";
      RandomizedDelaySec = "5min";
    };
  };

  # Fetch apiary bearer token from Vault
  vault.secrets.victoriametrics-apiary-token = {
    secretPath = "hosts/monitoring01/apiary-token";
    extractKey = "password";
    owner = "victoriametrics";
    group = "victoriametrics";
    services = [ "victoriametrics" ];
  };

  services.victoriametrics = {
    enable = true;
    retentionPeriod = "3"; # 3 months
    # Disable config check since we reference external credential files
    checkConfig = false;
    prometheusConfig = {
      global.scrape_interval = "15s";
      scrape_configs = scrapeConfigs;
    };
  };

  # vmalert for alerting rules - no notifier during parallel operation
  services.vmalert.instances.default = {
    enable = true;
    settings = {
      "datasource.url" = "http://localhost:8428";
      # Blackhole notifications during parallel operation to prevent duplicate alerts.
      # Replace with notifier.url after cutover from monitoring01:
      # "notifier.url" = [ "http://localhost:9093" ];
      "notifier.blackhole" = true;
      "rule" = [ ../monitoring/rules.yml ];
    };
  };

  # Caddy reverse proxy for VictoriaMetrics and vmalert
  services.caddy.virtualHosts."metrics.home.2rjus.net".extraConfig = ''
    reverse_proxy http://127.0.0.1:8428
  '';
  services.caddy.virtualHosts."vmalert.home.2rjus.net".extraConfig = ''
    reverse_proxy http://127.0.0.1:8880
  '';

  # Alertmanager - same config as monitoring01 but will only receive
  # alerts after cutover (vmalert notifier is disabled above)
  services.prometheus.alertmanager = {
    enable = true;
    configuration = {
      global = { };
      route = {
        receiver = "webhook_natstonotify";
        group_wait = "30s";
        group_interval = "5m";
        repeat_interval = "1h";
        group_by = [ "alertname" ];
      };
      receivers = [
        {
          name = "webhook_natstonotify";
          webhook_configs = [
            { url = "http://localhost:5001/alert"; }
          ];
        }
      ];
    };
  };
}