RemainAfterExit=true kept the service in "active" state, which prevented OnUnitActiveSec from scheduling new triggers since there was no new "activation" event. Removing it allows the service to properly go inactive, enabling the timer to reschedule correctly. Also fix ExecStart to use lib.getExe for proper path resolution with writeShellApplication. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
258 lines
6.8 KiB
Nix
258 lines
6.8 KiB
Nix
{ self, lib, pkgs, ... }:
|
|
let
|
|
# Targets listed in ./external-targets.nix; they are handed to the generators
# below together with the flake itself (`self`).
externalTargets = import ./external-targets.nix;

# Monitoring helper library, instantiated with nixpkgs' lib.
monLib = import ../../lib/monitoring.nix { inherit lib; };

# Scrape configs produced by the helper library; appended after the
# hand-written jobs in services.prometheus.scrapeConfigs.
autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;

# node-exporter target list; also reused (with the port swapped) for the
# systemd-exporter job.
nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
|
|
|
|
# Script to fetch an AppRole token for Prometheus to use when scraping OpenBao metrics.
#
# Reads role-id / secret-id from APPROLE_DIR, logs in through the AppRole auth
# endpoint and installs the returned client token at OUTPUT_FILE (owner
# prometheus, mode 0400). Exits non-zero with a message on stderr when the
# credentials are missing, the login request fails, or the response carries no
# token. writeShellApplication runs the text under `set -euo pipefail` and
# checks it with shellcheck at build time.
fetchOpenbaoToken = pkgs.writeShellApplication {
  name = "fetch-openbao-token";
  runtimeInputs = [ pkgs.curl pkgs.jq ];
  text = ''
    VAULT_ADDR="https://vault01.home.2rjus.net:8200"
    APPROLE_DIR="/var/lib/vault/approle"
    OUTPUT_FILE="/run/secrets/prometheus/openbao-token"

    # Read AppRole credentials
    if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
      echo "AppRole credentials not found at $APPROLE_DIR" >&2
      exit 1
    fi

    # Authenticate to Vault.
    # jq builds the JSON body straight from the credential files, so quotes or
    # backslashes in them are escaped correctly, and the body reaches curl on
    # stdin so the secret-id never shows up in a process argument list.
    # Trailing newlines are stripped, matching what $(cat file) would yield.
    # NOTE(review): -k disables TLS certificate verification for this request,
    # which sends the AppRole credentials; prefer --cacert with the issuing CA
    # if one is available on this host.
    # The $role_id / $secret_id below are jq variables, not shell expansions.
    # shellcheck disable=SC2016
    if ! AUTH_RESPONSE=$(
      jq -cn \
        --rawfile role_id "$APPROLE_DIR/role-id" \
        --rawfile secret_id "$APPROLE_DIR/secret-id" \
        '{role_id: ($role_id | sub("\n+$"; "")), secret_id: ($secret_id | sub("\n+$"; ""))}' \
        | curl -sf -k -X POST --data @- "$VAULT_ADDR/v1/auth/approle/login"
    ); then
      echo "AppRole login request to $VAULT_ADDR failed" >&2
      exit 1
    fi

    # Extract token (a missing or null field yields an empty string)
    VAULT_TOKEN=$(printf '%s' "$AUTH_RESPONSE" | jq -r '.auth.client_token // empty')
    if [ -z "$VAULT_TOKEN" ]; then
      echo "Failed to extract Vault token from response" >&2
      exit 1
    fi

    # Write token to file.
    # mktemp creates the file with owner-only permissions, so the token is
    # never world-readable, and the final rename within the same directory is
    # atomic: a concurrent reader sees either the old or the new token, never
    # a partial one.
    mkdir -p "$(dirname "$OUTPUT_FILE")"
    TMP_FILE=$(mktemp "$OUTPUT_FILE.XXXXXX")
    trap 'rm -f "$TMP_FILE"' EXIT

    printf '%s' "$VAULT_TOKEN" > "$TMP_FILE"
    chown prometheus:prometheus "$TMP_FILE"
    chmod 0400 "$TMP_FILE"
    mv -f "$TMP_FILE" "$OUTPUT_FILE"

    echo "Successfully fetched OpenBao token"
  '';
};
|
|
in
|
|
{
|
|
# Oneshot unit that obtains the AppRole token Prometheus presents when it
# scrapes /v1/sys/metrics on OpenBao.
systemd.services.prometheus-openbao-token = {
  description = "Fetch OpenBao token for Prometheus metrics scraping";

  # The login request needs the network to be up.
  wants = [ "network-online.target" ];
  after = [ "network-online.target" ];

  # prometheus.service requires this unit and is ordered after it.
  requiredBy = [ "prometheus.service" ];
  before = [ "prometheus.service" ];

  # No RemainAfterExit here: the unit has to go back to inactive after each
  # run so the timer's OnUnitActiveSec can schedule the next refresh.
  serviceConfig.Type = "oneshot";
  serviceConfig.ExecStart = lib.getExe fetchOpenbaoToken;
};
|
|
|
|
# Periodic refresh of the token; AppRole tokens have a 1-hour TTL, so the
# service is re-run well inside that window.
systemd.timers.prometheus-openbao-token = {
  description = "Refresh OpenBao token for Prometheus";
  wantedBy = [ "timers.target" ];

  # First run 5 minutes after boot, then 30 minutes after each activation of
  # the service, with up to 5 minutes of random delay added.
  timerConfig.OnBootSec = "5min";
  timerConfig.OnUnitActiveSec = "30min";
  timerConfig.RandomizedDelaySec = "5min";
};
|
|
|
|
services.prometheus =
  let
    # Scrape job made of a name plus one static_configs entry listing `targets`.
    # Jobs that need more settings extend the result with `//`.
    staticJob = job_name: targets: {
      inherit job_name;
      static_configs = [ { inherit targets; } ];
    };
  in
  {
    enable = true;

    # syntax-only check because we use external credential files (e.g., openbao-token)
    checkConfig = "syntax-only";

    retentionTime = "30d";
    globalConfig.scrape_interval = "15s";
    rules = [ (builtins.readFile ./rules.yml) ];

    # Alertmanager instance: every alert goes to a single webhook receiver.
    alertmanager.enable = true;
    alertmanager.configuration = {
      global = { };
      route = {
        receiver = "webhook_natstonotify";
        group_by = [ "alertname" ];
        group_wait = "30s";
        group_interval = "5m";
        repeat_interval = "1h";
      };
      receivers = [
        {
          name = "webhook_natstonotify";
          webhook_configs = [ { url = "http://localhost:5001/alert"; } ];
        }
      ];
    };

    # Where Prometheus delivers firing alerts.
    alertmanagers = [
      { static_configs = [ { targets = [ "localhost:9093" ]; } ]; }
    ];

    scrapeConfigs = [
      # Auto-generated node-exporter targets from flake hosts + external
      (staticJob "node-exporter" nodeExporterTargets)

      # Systemd exporter on all hosts (same targets, different port)
      (staticJob "systemd-exporter" (
        map (builtins.replaceStrings [ ":9100" ] [ ":9558" ]) nodeExporterTargets
      ))

      # Local monitoring services (not auto-generated)
      (staticJob "prometheus" [ "localhost:9090" ])
      (staticJob "loki" [ "localhost:3100" ])
      (staticJob "grafana" [ "localhost:3000" ])
      (staticJob "alertmanager" [ "localhost:9093" ])
      (staticJob "pushgateway" [ "localhost:9091" ] // { honor_labels = true; })
      (staticJob "labmon" [ "monitoring01.home.2rjus.net:9969" ])

      # TODO: nix-cache_caddy can't be auto-generated because the cert is issued
      # for nix-cache.home.2rjus.net (service CNAME), not nix-cache01 (hostname).
      # Consider adding a target override to homelab.monitoring.scrapeTargets.
      (staticJob "nix-cache_caddy" [ "nix-cache.home.2rjus.net" ] // { scheme = "https"; })

      # pve-exporter with complex relabel config: the listed target is copied
      # into the `target` URL parameter and the `instance` label, then the
      # scrape address is rewritten to 127.0.0.1:9221.
      (staticJob "pve-exporter" [ "10.69.12.75" ] // {
        metrics_path = "/pve";
        params = {
          module = [ "default" ];
          cluster = [ "1" ];
          node = [ "1" ];
        };
        relabel_configs = [
          {
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            target_label = "__address__";
            replacement = "127.0.0.1:9221";
          }
        ];
      })

      # OpenBao metrics with bearer token auth; the token file is written by
      # the prometheus-openbao-token service.
      (staticJob "openbao" [ "vault01.home.2rjus.net:8200" ] // {
        scheme = "https";
        metrics_path = "/v1/sys/metrics";
        params.format = [ "prometheus" ];
        authorization = {
          type = "Bearer";
          credentials_file = "/run/secrets/prometheus/openbao-token";
        };
      })
    ] ++ autoScrapeConfigs;

    pushgateway = {
      enable = true;
      web.external-url = "https://pushgw.home.2rjus.net";
    };
  };
|
|
}
|