monitoring01: remove host and migrate services to monitoring02
Remove monitoring01 host configuration and unused service modules (prometheus, grafana, loki, tempo, pyroscope).

Migrate blackbox, exportarr, and pve exporters to monitoring02 with scrape configs moved to VictoriaMetrics.

Update alert rules, terraform vault policies/secrets, http-proxy entries, and documentation to reflect the monitoring02 migration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,33 +1,4 @@
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
# TLS endpoints to monitor for certificate expiration
|
||||
# These are all services using ACME certificates from OpenBao PKI
|
||||
tlsTargets = [
|
||||
# Direct ACME certs (security.acme.certs)
|
||||
"https://vault.home.2rjus.net:8200"
|
||||
"https://auth.home.2rjus.net"
|
||||
"https://testvm01.home.2rjus.net"
|
||||
|
||||
# Caddy auto-TLS on http-proxy
|
||||
"https://nzbget.home.2rjus.net"
|
||||
"https://radarr.home.2rjus.net"
|
||||
"https://sonarr.home.2rjus.net"
|
||||
"https://ha.home.2rjus.net"
|
||||
"https://z2m.home.2rjus.net"
|
||||
"https://prometheus.home.2rjus.net"
|
||||
"https://alertmanager.home.2rjus.net"
|
||||
"https://grafana.home.2rjus.net"
|
||||
"https://jelly.home.2rjus.net"
|
||||
"https://pyroscope.home.2rjus.net"
|
||||
"https://pushgw.home.2rjus.net"
|
||||
|
||||
# Caddy auto-TLS on nix-cache02
|
||||
"https://nix-cache.home.2rjus.net"
|
||||
|
||||
# Caddy auto-TLS on grafana01
|
||||
"https://grafana-test.home.2rjus.net"
|
||||
];
|
||||
in
|
||||
{
|
||||
services.prometheus.exporters.blackbox = {
|
||||
enable = true;
|
||||
@@ -57,36 +28,4 @@ in
|
||||
- 503
|
||||
'';
|
||||
};
|
||||
|
||||
# Add blackbox scrape config to Prometheus
|
||||
# Alert rules are in rules.yml (certificate_rules group)
|
||||
services.prometheus.scrapeConfigs = [
|
||||
{
|
||||
job_name = "blackbox_tls";
|
||||
metrics_path = "/probe";
|
||||
params = {
|
||||
module = [ "https_cert" ];
|
||||
};
|
||||
static_configs = [{
|
||||
targets = tlsTargets;
|
||||
}];
|
||||
relabel_configs = [
|
||||
# Pass the target URL to blackbox as a parameter
|
||||
{
|
||||
source_labels = [ "__address__" ];
|
||||
target_label = "__param_target";
|
||||
}
|
||||
# Use the target URL as the instance label
|
||||
{
|
||||
source_labels = [ "__param_target" ];
|
||||
target_label = "instance";
|
||||
}
|
||||
# Point the actual scrape at the local blackbox exporter
|
||||
{
|
||||
target_label = "__address__";
|
||||
replacement = "127.0.0.1:9115";
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
{ ... }:
|
||||
{
|
||||
imports = [
|
||||
./loki.nix
|
||||
./grafana.nix
|
||||
./prometheus.nix
|
||||
./blackbox.nix
|
||||
./exportarr.nix
|
||||
./pve.nix
|
||||
./alerttonotify.nix
|
||||
./pyroscope.nix
|
||||
./tempo.nix
|
||||
];
|
||||
}
|
||||
@@ -14,14 +14,4 @@
|
||||
apiKeyFile = config.vault.secrets.sonarr-api-key.outputDir;
|
||||
port = 9709;
|
||||
};
|
||||
|
||||
# Scrape config
|
||||
services.prometheus.scrapeConfigs = [
|
||||
{
|
||||
job_name = "sonarr";
|
||||
static_configs = [{
|
||||
targets = [ "localhost:9709" ];
|
||||
}];
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
{ pkgs, ... }:
|
||||
{
|
||||
services.grafana = {
|
||||
enable = true;
|
||||
settings = {
|
||||
server = {
|
||||
http_addr = "";
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
{ ... }:
|
||||
{
|
||||
services.loki = {
|
||||
enable = true;
|
||||
configuration = {
|
||||
auth_enabled = false;
|
||||
|
||||
server = {
|
||||
http_listen_port = 3100;
|
||||
};
|
||||
common = {
|
||||
ring = {
|
||||
instance_addr = "127.0.0.1";
|
||||
kvstore = {
|
||||
store = "inmemory";
|
||||
};
|
||||
};
|
||||
replication_factor = 1;
|
||||
path_prefix = "/var/lib/loki";
|
||||
};
|
||||
schema_config = {
|
||||
configs = [
|
||||
{
|
||||
from = "2024-01-01";
|
||||
store = "tsdb";
|
||||
object_store = "filesystem";
|
||||
schema = "v13";
|
||||
index = {
|
||||
prefix = "loki_index_";
|
||||
period = "24h";
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
storage_config = {
|
||||
filesystem = {
|
||||
directory = "/var/lib/loki/chunks";
|
||||
};
|
||||
};
|
||||
compactor = {
|
||||
working_directory = "/var/lib/loki/compactor";
|
||||
compaction_interval = "10m";
|
||||
retention_enabled = true;
|
||||
retention_delete_delay = "2h";
|
||||
retention_delete_worker_count = 150;
|
||||
delete_request_store = "filesystem";
|
||||
};
|
||||
limits_config = {
|
||||
retention_period = "30d";
|
||||
ingestion_rate_mb = 10;
|
||||
ingestion_burst_size_mb = 20;
|
||||
max_streams_per_user = 10000;
|
||||
max_query_series = 500;
|
||||
max_query_parallelism = 8;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -1,267 +0,0 @@
|
||||
{ self, lib, pkgs, ... }:
|
||||
let
|
||||
monLib = import ../../lib/monitoring.nix { inherit lib; };
|
||||
externalTargets = import ./external-targets.nix;
|
||||
|
||||
nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
|
||||
autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
|
||||
|
||||
# Script to fetch AppRole token for Prometheus to use when scraping OpenBao metrics
|
||||
fetchOpenbaoToken = pkgs.writeShellApplication {
|
||||
name = "fetch-openbao-token";
|
||||
runtimeInputs = [ pkgs.curl pkgs.jq ];
|
||||
text = ''
|
||||
VAULT_ADDR="https://vault01.home.2rjus.net:8200"
|
||||
APPROLE_DIR="/var/lib/vault/approle"
|
||||
OUTPUT_FILE="/run/secrets/prometheus/openbao-token"
|
||||
|
||||
# Read AppRole credentials
|
||||
if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
|
||||
echo "AppRole credentials not found at $APPROLE_DIR" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ROLE_ID=$(cat "$APPROLE_DIR/role-id")
|
||||
SECRET_ID=$(cat "$APPROLE_DIR/secret-id")
|
||||
|
||||
# Authenticate to Vault
|
||||
AUTH_RESPONSE=$(curl -sf -k -X POST \
|
||||
-d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \
|
||||
"$VAULT_ADDR/v1/auth/approle/login")
|
||||
|
||||
# Extract token
|
||||
VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token')
|
||||
if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
|
||||
echo "Failed to extract Vault token from response" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Write token to file
|
||||
mkdir -p "$(dirname "$OUTPUT_FILE")"
|
||||
echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE"
|
||||
chown prometheus:prometheus "$OUTPUT_FILE"
|
||||
chmod 0400 "$OUTPUT_FILE"
|
||||
|
||||
echo "Successfully fetched OpenBao token"
|
||||
'';
|
||||
};
|
||||
in
|
||||
{
|
||||
# Systemd service to fetch AppRole token for Prometheus OpenBao scraping
|
||||
# The token is used to authenticate when scraping /v1/sys/metrics
|
||||
systemd.services.prometheus-openbao-token = {
|
||||
description = "Fetch OpenBao token for Prometheus metrics scraping";
|
||||
after = [ "network-online.target" ];
|
||||
wants = [ "network-online.target" ];
|
||||
before = [ "prometheus.service" ];
|
||||
requiredBy = [ "prometheus.service" ];
|
||||
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
ExecStart = lib.getExe fetchOpenbaoToken;
|
||||
};
|
||||
};
|
||||
|
||||
# Timer to periodically refresh the token (AppRole tokens have 1-hour TTL)
|
||||
systemd.timers.prometheus-openbao-token = {
|
||||
description = "Refresh OpenBao token for Prometheus";
|
||||
wantedBy = [ "timers.target" ];
|
||||
timerConfig = {
|
||||
OnBootSec = "5min";
|
||||
OnUnitActiveSec = "30min";
|
||||
RandomizedDelaySec = "5min";
|
||||
};
|
||||
};
|
||||
|
||||
# Fetch apiary bearer token from Vault
|
||||
vault.secrets.prometheus-apiary-token = {
|
||||
secretPath = "hosts/monitoring01/apiary-token";
|
||||
extractKey = "password";
|
||||
owner = "prometheus";
|
||||
group = "prometheus";
|
||||
services = [ "prometheus" ];
|
||||
};
|
||||
|
||||
services.prometheus = {
|
||||
enable = true;
|
||||
# syntax-only check because we use external credential files (e.g., openbao-token)
|
||||
checkConfig = "syntax-only";
|
||||
alertmanager = {
|
||||
enable = true;
|
||||
configuration = {
|
||||
global = {
|
||||
};
|
||||
route = {
|
||||
receiver = "webhook_natstonotify";
|
||||
group_wait = "30s";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "1h";
|
||||
group_by = [ "alertname" ];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "webhook_natstonotify";
|
||||
webhook_configs = [
|
||||
{
|
||||
url = "http://localhost:5001/alert";
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
alertmanagers = [
|
||||
{
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:9093" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
|
||||
retentionTime = "30d";
|
||||
globalConfig = {
|
||||
scrape_interval = "15s";
|
||||
};
|
||||
rules = [
|
||||
(builtins.readFile ./rules.yml)
|
||||
];
|
||||
|
||||
scrapeConfigs = [
|
||||
# Auto-generated node-exporter targets from flake hosts + external
|
||||
# Each static_config entry may have labels from homelab.host metadata
|
||||
{
|
||||
job_name = "node-exporter";
|
||||
static_configs = nodeExporterTargets;
|
||||
}
|
||||
# Systemd exporter on all hosts (same targets, different port)
|
||||
# Preserves the same label grouping as node-exporter
|
||||
{
|
||||
job_name = "systemd-exporter";
|
||||
static_configs = map
|
||||
(cfg: cfg // {
|
||||
targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets;
|
||||
})
|
||||
nodeExporterTargets;
|
||||
}
|
||||
# Local monitoring services (not auto-generated)
|
||||
{
|
||||
job_name = "prometheus";
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:9090" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "loki";
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:3100" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "grafana";
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:3000" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "alertmanager";
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:9093" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
job_name = "pushgateway";
|
||||
honor_labels = true;
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:9091" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
# Caddy metrics from nix-cache02 (serves nix-cache.home.2rjus.net)
|
||||
{
|
||||
job_name = "nix-cache_caddy";
|
||||
scheme = "https";
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "nix-cache.home.2rjus.net" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
# pve-exporter with complex relabel config
|
||||
{
|
||||
job_name = "pve-exporter";
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "10.69.12.75" ];
|
||||
}
|
||||
];
|
||||
metrics_path = "/pve";
|
||||
params = {
|
||||
module = [ "default" ];
|
||||
cluster = [ "1" ];
|
||||
node = [ "1" ];
|
||||
};
|
||||
relabel_configs = [
|
||||
{
|
||||
source_labels = [ "__address__" ];
|
||||
target_label = "__param_target";
|
||||
}
|
||||
{
|
||||
source_labels = [ "__param_target" ];
|
||||
target_label = "instance";
|
||||
}
|
||||
{
|
||||
target_label = "__address__";
|
||||
replacement = "127.0.0.1:9221";
|
||||
}
|
||||
];
|
||||
}
|
||||
# OpenBao metrics with bearer token auth
|
||||
{
|
||||
job_name = "openbao";
|
||||
scheme = "https";
|
||||
metrics_path = "/v1/sys/metrics";
|
||||
params = {
|
||||
format = [ "prometheus" ];
|
||||
};
|
||||
static_configs = [{
|
||||
targets = [ "vault01.home.2rjus.net:8200" ];
|
||||
}];
|
||||
authorization = {
|
||||
type = "Bearer";
|
||||
credentials_file = "/run/secrets/prometheus/openbao-token";
|
||||
};
|
||||
}
|
||||
# Apiary external service
|
||||
{
|
||||
job_name = "apiary";
|
||||
scheme = "https";
|
||||
scrape_interval = "60s";
|
||||
static_configs = [{
|
||||
targets = [ "apiary.t-juice.club" ];
|
||||
}];
|
||||
authorization = {
|
||||
type = "Bearer";
|
||||
credentials_file = "/run/secrets/prometheus-apiary-token";
|
||||
};
|
||||
}
|
||||
] ++ autoScrapeConfigs;
|
||||
|
||||
pushgateway = {
|
||||
enable = true;
|
||||
web = {
|
||||
external-url = "https://pushgw.home.2rjus.net";
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
{ config, ... }:
|
||||
{
|
||||
vault.secrets.pve-exporter = {
|
||||
secretPath = "hosts/monitoring01/pve-exporter";
|
||||
secretPath = "hosts/monitoring02/pve-exporter";
|
||||
extractKey = "config";
|
||||
outputDir = "/run/secrets/pve_exporter";
|
||||
mode = "0444";
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
{ ... }:
|
||||
{
|
||||
virtualisation.oci-containers.containers.pyroscope = {
|
||||
pull = "missing";
|
||||
image = "grafana/pyroscope:latest";
|
||||
ports = [ "4040:4040" ];
|
||||
};
|
||||
}
|
||||
@@ -259,32 +259,32 @@ groups:
|
||||
description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
|
||||
- name: monitoring_rules
|
||||
rules:
|
||||
- alert: prometheus_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
|
||||
- alert: victoriametrics_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Prometheus service not running on {{ $labels.instance }}"
|
||||
description: "Prometheus service not running on {{ $labels.instance }}"
|
||||
summary: "VictoriaMetrics service not running on {{ $labels.instance }}"
|
||||
description: "VictoriaMetrics service not running on {{ $labels.instance }}"
|
||||
- alert: vmalert_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "vmalert service not running on {{ $labels.instance }}"
|
||||
description: "vmalert service not running on {{ $labels.instance }}"
|
||||
- alert: alertmanager_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Alertmanager service not running on {{ $labels.instance }}"
|
||||
description: "Alertmanager service not running on {{ $labels.instance }}"
|
||||
- alert: pushgateway_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Pushgateway service not running on {{ $labels.instance }}"
|
||||
description: "Pushgateway service not running on {{ $labels.instance }}"
|
||||
- alert: loki_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -292,29 +292,13 @@ groups:
|
||||
summary: "Loki service not running on {{ $labels.instance }}"
|
||||
description: "Loki service not running on {{ $labels.instance }}"
|
||||
- alert: grafana_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Grafana service not running on {{ $labels.instance }}"
|
||||
description: "Grafana service not running on {{ $labels.instance }}"
|
||||
- alert: tempo_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Tempo service not running on {{ $labels.instance }}"
|
||||
description: "Tempo service not running on {{ $labels.instance }}"
|
||||
- alert: pyroscope_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Pyroscope service not running on {{ $labels.instance }}"
|
||||
description: "Pyroscope service not running on {{ $labels.instance }}"
|
||||
- name: proxmox_rules
|
||||
rules:
|
||||
- alert: pve_node_down
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
{ ... }:
|
||||
{
|
||||
services.tempo = {
|
||||
enable = true;
|
||||
settings = {
|
||||
server = {
|
||||
http_listen_port = 3200;
|
||||
grpc_listen_port = 3201;
|
||||
};
|
||||
distributor = {
|
||||
receivers = {
|
||||
otlp = {
|
||||
protocols = {
|
||||
http = {
|
||||
endpoint = ":4318";
|
||||
cors = {
|
||||
allowed_origins = [ "*.home.2rjus.net" ];
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
storage = {
|
||||
trace = {
|
||||
backend = "local";
|
||||
local = {
|
||||
path = "/var/lib/tempo";
|
||||
};
|
||||
wal = {
|
||||
path = "/var/lib/tempo/wal";
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user