Instead of creating a long-lived Vault token in Terraform (which gets invalidated when Terraform recreates it), monitoring01 now uses its existing AppRole credentials to fetch a fresh token for Prometheus. Changes: - Add prometheus-metrics policy to monitoring01's AppRole - Remove vault_token.prometheus_metrics resource from Terraform - Remove openbao-token KV secret from Terraform - Add systemd service to fetch AppRole token on boot - Add systemd timer to refresh token every 30 minutes This ensures Prometheus always has a valid token without depending on Terraform state or manual intervention. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
257 lines · 6.7 KiB · Nix
{ self, lib, pkgs, ... }:

let
  # Shared helpers that expand flake hosts + static externals into
  # Prometheus target lists and scrape configs.
  monLib = import ../../lib/monitoring.nix { inherit lib; };
  externalTargets = import ./external-targets.nix;

  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;

  # Script to fetch an AppRole-derived token that Prometheus uses when
  # scraping OpenBao's /v1/sys/metrics endpoint.
  fetchOpenbaoToken = pkgs.writeShellScript "fetch-openbao-token" ''
    set -euo pipefail

    VAULT_ADDR="https://vault01.home.2rjus.net:8200"
    APPROLE_DIR="/var/lib/vault/approle"
    OUTPUT_FILE="/run/secrets/prometheus/openbao-token"

    # Read AppRole credentials (provisioned outside this module).
    if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
      echo "AppRole credentials not found at $APPROLE_DIR" >&2
      exit 1
    fi

    ROLE_ID=$(cat "$APPROLE_DIR/role-id")
    SECRET_ID=$(cat "$APPROLE_DIR/secret-id")

    # Authenticate to Vault. The login payload is built with jq (correct
    # JSON escaping) and passed on stdin via --data @- so the secret-id
    # never appears in the process argument list, where it would be
    # visible to every local user through ps/procfs.
    # NOTE(review): -k disables TLS verification; presumably vault01 uses
    # an internal CA -- consider --cacert with the CA bundle instead.
    AUTH_RESPONSE=$(${pkgs.jq}/bin/jq -n \
      --arg role_id "$ROLE_ID" \
      --arg secret_id "$SECRET_ID" \
      '{role_id: $role_id, secret_id: $secret_id}' \
      | ${pkgs.curl}/bin/curl -sf -k -X POST \
          --data @- \
          "$VAULT_ADDR/v1/auth/approle/login")

    # Extract the client token from the login response.
    VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | ${pkgs.jq}/bin/jq -r '.auth.client_token')
    if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
      echo "Failed to extract Vault token from response" >&2
      exit 1
    fi

    # Publish the token atomically (temp file + rename) so a concurrent
    # Prometheus scrape never reads a partially written credentials file
    # during a timer-driven refresh.
    mkdir -p "$(dirname "$OUTPUT_FILE")"
    TMP_FILE="$OUTPUT_FILE.tmp"
    printf '%s' "$VAULT_TOKEN" > "$TMP_FILE"
    chown prometheus:prometheus "$TMP_FILE"
    chmod 0400 "$TMP_FILE"
    mv "$TMP_FILE" "$OUTPUT_FILE"

    echo "Successfully fetched OpenBao token"
  '';
in
|
|
{
  # One-shot unit that obtains a fresh OpenBao token before Prometheus
  # starts; the token authenticates scrapes of /v1/sys/metrics.
  systemd.services.prometheus-openbao-token = {
    description = "Fetch OpenBao token for Prometheus metrics scraping";
    wants = [ "network-online.target" ];
    after = [ "network-online.target" ];
    # Ordering plus a hard dependency: Prometheus only starts once a
    # token has been fetched successfully.
    before = [ "prometheus.service" ];
    requiredBy = [ "prometheus.service" ];

    serviceConfig = {
      Type = "oneshot";
      # Keep the unit "active" after the fetch completes so the
      # requiredBy relation stays satisfied between timer refreshes.
      RemainAfterExit = true;
      ExecStart = fetchOpenbaoToken;
    };
  };
|
|
|
|
# Timer to periodically refresh the token (AppRole tokens have 1-hour TTL)
|
|
systemd.timers.prometheus-openbao-token = {
|
|
description = "Refresh OpenBao token for Prometheus";
|
|
wantedBy = [ "timers.target" ];
|
|
timerConfig = {
|
|
OnBootSec = "5min";
|
|
OnUnitActiveSec = "30min";
|
|
RandomizedDelaySec = "5min";
|
|
};
|
|
};
|
|
|
|
services.prometheus = {
|
|
enable = true;
|
|
# syntax-only check because we use external credential files (e.g., openbao-token)
|
|
checkConfig = "syntax-only";
|
|
alertmanager = {
|
|
enable = true;
|
|
configuration = {
|
|
global = {
|
|
};
|
|
route = {
|
|
receiver = "webhook_natstonotify";
|
|
group_wait = "30s";
|
|
group_interval = "5m";
|
|
repeat_interval = "1h";
|
|
group_by = [ "alertname" ];
|
|
};
|
|
receivers = [
|
|
{
|
|
name = "webhook_natstonotify";
|
|
webhook_configs = [
|
|
{
|
|
url = "http://localhost:5001/alert";
|
|
}
|
|
];
|
|
}
|
|
];
|
|
};
|
|
};
|
|
alertmanagers = [
|
|
{
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:9093" ];
|
|
}
|
|
];
|
|
}
|
|
];
|
|
|
|
retentionTime = "30d";
|
|
globalConfig = {
|
|
scrape_interval = "15s";
|
|
};
|
|
rules = [
|
|
(builtins.readFile ./rules.yml)
|
|
];
|
|
|
|
scrapeConfigs = [
|
|
# Auto-generated node-exporter targets from flake hosts + external
|
|
{
|
|
job_name = "node-exporter";
|
|
static_configs = [
|
|
{
|
|
targets = nodeExporterTargets;
|
|
}
|
|
];
|
|
}
|
|
# Systemd exporter on all hosts (same targets, different port)
|
|
{
|
|
job_name = "systemd-exporter";
|
|
static_configs = [
|
|
{
|
|
targets = map (t: builtins.replaceStrings [":9100"] [":9558"] t) nodeExporterTargets;
|
|
}
|
|
];
|
|
}
|
|
# Local monitoring services (not auto-generated)
|
|
{
|
|
job_name = "prometheus";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:9090" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
job_name = "loki";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:3100" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
job_name = "grafana";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:3000" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
job_name = "alertmanager";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:9093" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
job_name = "pushgateway";
|
|
honor_labels = true;
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:9091" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
job_name = "labmon";
|
|
static_configs = [
|
|
{
|
|
targets = [ "monitoring01.home.2rjus.net:9969" ];
|
|
}
|
|
];
|
|
}
|
|
# TODO: nix-cache_caddy can't be auto-generated because the cert is issued
|
|
# for nix-cache.home.2rjus.net (service CNAME), not nix-cache01 (hostname).
|
|
# Consider adding a target override to homelab.monitoring.scrapeTargets.
|
|
{
|
|
job_name = "nix-cache_caddy";
|
|
scheme = "https";
|
|
static_configs = [
|
|
{
|
|
targets = [ "nix-cache.home.2rjus.net" ];
|
|
}
|
|
];
|
|
}
|
|
# pve-exporter with complex relabel config
|
|
{
|
|
job_name = "pve-exporter";
|
|
static_configs = [
|
|
{
|
|
targets = [ "10.69.12.75" ];
|
|
}
|
|
];
|
|
metrics_path = "/pve";
|
|
params = {
|
|
module = [ "default" ];
|
|
cluster = [ "1" ];
|
|
node = [ "1" ];
|
|
};
|
|
relabel_configs = [
|
|
{
|
|
source_labels = [ "__address__" ];
|
|
target_label = "__param_target";
|
|
}
|
|
{
|
|
source_labels = [ "__param_target" ];
|
|
target_label = "instance";
|
|
}
|
|
{
|
|
target_label = "__address__";
|
|
replacement = "127.0.0.1:9221";
|
|
}
|
|
];
|
|
}
|
|
# OpenBao metrics with bearer token auth
|
|
{
|
|
job_name = "openbao";
|
|
scheme = "https";
|
|
metrics_path = "/v1/sys/metrics";
|
|
params = {
|
|
format = [ "prometheus" ];
|
|
};
|
|
static_configs = [{
|
|
targets = [ "vault01.home.2rjus.net:8200" ];
|
|
}];
|
|
authorization = {
|
|
type = "Bearer";
|
|
credentials_file = "/run/secrets/prometheus/openbao-token";
|
|
};
|
|
}
|
|
] ++ autoScrapeConfigs;
|
|
|
|
pushgateway = {
|
|
enable = true;
|
|
web = {
|
|
external-url = "https://pushgw.home.2rjus.net";
|
|
};
|
|
};
|
|
};
|
|
}
|