monitoring02: add VictoriaMetrics, vmalert, and Alertmanager
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
Set up the core metrics stack on monitoring02 as Phase 2 of the monitoring migration. VictoriaMetrics replaces Prometheus with identical scrape configs (22 jobs including auto-generated targets).

- VictoriaMetrics with 3-month retention and all scrape configs
- vmalert evaluating existing rules.yml (notifier disabled)
- Alertmanager with same routing config (no alerts during parallel op)
- Grafana datasources updated: local VictoriaMetrics as default
- Static user override for credential file access (OpenBao, Apiary)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
210
services/victoriametrics/default.nix
Normal file
210
services/victoriametrics/default.nix
Normal file
@@ -0,0 +1,210 @@
|
||||
{ self, config, lib, pkgs, ... }:

let
  # Shared monitoring helpers: generators for exporter targets and
  # scrape configs derived from the flake's hosts.
  monLib = import ../../lib/monitoring.nix { inherit lib; };
  # Hand-maintained scrape targets that are not NixOS hosts in this flake.
  externalTargets = import ../monitoring/external-targets.nix;

  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;

  # Script to fetch AppRole token for VictoriaMetrics to use when scraping
  # OpenBao metrics. Writes the token to OUTPUT_FILE, owned by the
  # victoriametrics user, mode 0400. Exits non-zero if the AppRole
  # credentials are missing or authentication fails.
  fetchOpenbaoToken = pkgs.writeShellApplication {
    name = "fetch-openbao-token-vm";
    runtimeInputs = [ pkgs.curl pkgs.jq ];
    text = ''
      VAULT_ADDR="https://vault01.home.2rjus.net:8200"
      APPROLE_DIR="/var/lib/vault/approle"
      OUTPUT_FILE="/run/secrets/victoriametrics/openbao-token"

      # Restrict every file we create to owner-only access so the token is
      # never readable by other users, even in the window between the
      # redirection below and the explicit chmod 0400.
      umask 077

      # Read AppRole credentials
      if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
        echo "AppRole credentials not found at $APPROLE_DIR" >&2
        exit 1
      fi

      ROLE_ID=$(cat "$APPROLE_DIR/role-id")
      SECRET_ID=$(cat "$APPROLE_DIR/secret-id")

      # Authenticate to Vault.
      # NOTE(review): -k disables TLS certificate verification for the Vault
      # endpoint — presumably the host uses an internal CA. Prefer adding
      # that CA to the system trust store and dropping -k.
      AUTH_RESPONSE=$(curl -sf -k -X POST \
        -d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \
        "$VAULT_ADDR/v1/auth/approle/login")

      # Extract token
      VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token')
      if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
        echo "Failed to extract Vault token from response" >&2
        exit 1
      fi

      # Write token to file
      mkdir -p "$(dirname "$OUTPUT_FILE")"
      echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE"
      chown victoriametrics:victoriametrics "$OUTPUT_FILE"
      chmod 0400 "$OUTPUT_FILE"

      echo "Successfully fetched OpenBao token"
    '';
  };
  # Prometheus-compatible scrape_configs consumed by VictoriaMetrics below.
  # Static jobs are listed first; flake-derived jobs are appended at the end.
  scrapeConfigs = [
    # Auto-generated node-exporter targets from flake hosts + external
    {
      job_name = "node-exporter";
      static_configs = nodeExporterTargets;
    }
    # Systemd exporter on all hosts (same targets, different port)
    {
      job_name = "systemd-exporter";
      # Reuse the node-exporter target list, rewriting port :9100 -> :9558
      # in each target string while keeping the rest of each config intact.
      static_configs = map
        (cfg: cfg // {
          targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets;
        })
        nodeExporterTargets;
    }
    # Local monitoring services
    {
      job_name = "victoriametrics";
      static_configs = [{ targets = [ "localhost:8428" ]; }];
    }
    {
      job_name = "loki";
      static_configs = [{ targets = [ "localhost:3100" ]; }];
    }
    {
      job_name = "grafana";
      static_configs = [{ targets = [ "localhost:3000" ]; }];
    }
    {
      job_name = "alertmanager";
      static_configs = [{ targets = [ "localhost:9093" ]; }];
    }
    # Caddy metrics from nix-cache02
    {
      job_name = "nix-cache_caddy";
      scheme = "https";
      static_configs = [{ targets = [ "nix-cache.home.2rjus.net" ]; }];
    }
    # OpenBao metrics with bearer token auth
    {
      job_name = "openbao";
      scheme = "https";
      metrics_path = "/v1/sys/metrics";
      params = { format = [ "prometheus" ]; };
      static_configs = [{ targets = [ "vault01.home.2rjus.net:8200" ]; }];
      authorization = {
        type = "Bearer";
        # Token file maintained by the victoriametrics-openbao-token
        # service/timer defined in this module.
        credentials_file = "/run/secrets/victoriametrics/openbao-token";
      };
    }
    # Apiary external service
    {
      job_name = "apiary";
      scheme = "https";
      scrape_interval = "60s";
      static_configs = [{ targets = [ "apiary.t-juice.club" ]; }];
      authorization = {
        type = "Bearer";
        # Provided by vault.secrets.victoriametrics-apiary-token in this module.
        credentials_file = "/run/secrets/victoriametrics-apiary-token";
      };
    }
  ] ++ autoScrapeConfigs;
in
{
  # Static user for VictoriaMetrics (overrides DynamicUser) so vault.secrets
  # and credential files can be owned by this user
  users.users.victoriametrics = {
    isSystemUser = true;
    group = "victoriametrics";
  };
  users.groups.victoriametrics = { };

  # Override DynamicUser since we need a static user for credential file access
  systemd.services.victoriametrics.serviceConfig = {
    DynamicUser = lib.mkForce false;
    User = "victoriametrics";
    Group = "victoriametrics";
  };

  # Systemd service to fetch AppRole token for OpenBao scraping.
  # Ordered before victoriametrics.service and required by it, so the
  # credentials file exists when scraping starts.
  systemd.services.victoriametrics-openbao-token = {
    description = "Fetch OpenBao token for VictoriaMetrics metrics scraping";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    before = [ "victoriametrics.service" ];
    requiredBy = [ "victoriametrics.service" ];

    serviceConfig = {
      Type = "oneshot";
      ExecStart = lib.getExe fetchOpenbaoToken;
    };
  };

  # Timer to periodically refresh the token (AppRole tokens have 1-hour TTL)
  systemd.timers.victoriametrics-openbao-token = {
    description = "Refresh OpenBao token for VictoriaMetrics";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnBootSec = "5min";
      # 30min cadence keeps the token well within its TTL; jitter avoids
      # synchronized Vault logins across hosts.
      OnUnitActiveSec = "30min";
      RandomizedDelaySec = "5min";
    };
  };

  # Fetch apiary bearer token from Vault
  vault.secrets.victoriametrics-apiary-token = {
    secretPath = "hosts/monitoring01/apiary-token";
    extractKey = "password";
    owner = "victoriametrics";
    group = "victoriametrics";
    services = [ "victoriametrics" ];
  };

  services.victoriametrics = {
    enable = true;
    # NOTE(review): the comment claims months; confirm the unit the module
    # applies to a bare "3" (VictoriaMetrics -retentionPeriod defaults to
    # months for unitless values).
    retentionPeriod = "3"; # 3 months
    # Disable config check since we reference external credential files
    checkConfig = false;
    prometheusConfig = {
      global.scrape_interval = "15s";
      scrape_configs = scrapeConfigs;
    };
  };

  # vmalert for alerting rules - no notifier during parallel operation
  services.vmalert.instances.default = {
    enable = true;
    settings = {
      # Query the local VictoriaMetrics instance for rule evaluation.
      "datasource.url" = "http://localhost:8428";
      # Notifier disabled during parallel operation to prevent duplicate alerts
      # Uncomment after cutover from monitoring01:
      # "notifier.url" = [ "http://localhost:9093" ];
      "rule" = [ ../monitoring/rules.yml ];
    };
  };

  # Alertmanager - same config as monitoring01 but will only receive
  # alerts after cutover (vmalert notifier is disabled above)
  services.prometheus.alertmanager = {
    enable = true;
    configuration = {
      global = { };
      route = {
        receiver = "webhook_natstonotify";
        group_wait = "30s";
        group_interval = "5m";
        repeat_interval = "1h";
        group_by = [ "alertname" ];
      };
      receivers = [
        {
          name = "webhook_natstonotify";
          webhook_configs = [
            {
              # Local NATS-notify bridge; receives fired alerts as webhooks.
              url = "http://localhost:5001/alert";
            }
          ];
        }
      ];
    };
  };
}
Reference in New Issue
Block a user