From e329f87b0bdb6b97a9e38c40fe7936b50638da58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 17 Feb 2026 00:29:34 +0100 Subject: [PATCH 1/3] monitoring02: add VictoriaMetrics, vmalert, and Alertmanager Set up the core metrics stack on monitoring02 as Phase 2 of the monitoring migration. VictoriaMetrics replaces Prometheus with identical scrape configs (22 jobs including auto-generated targets). - VictoriaMetrics with 3-month retention and all scrape configs - vmalert evaluating existing rules.yml (notifier disabled) - Alertmanager with same routing config (no alerts during parallel op) - Grafana datasources updated: local VictoriaMetrics as default - Static user override for credential file access (OpenBao, Apiary) Co-Authored-By: Claude Opus 4.6 --- .../monitoring-migration-victoriametrics.md | 118 ++++------ hosts/monitoring02/default.nix | 1 + services/grafana/default.nix | 12 +- services/victoriametrics/default.nix | 211 ++++++++++++++++++ 4 files changed, 264 insertions(+), 78 deletions(-) create mode 100644 services/victoriametrics/default.nix diff --git a/docs/plans/monitoring-migration-victoriametrics.md b/docs/plans/monitoring-migration-victoriametrics.md index 1eafea9..7c6c349 100644 --- a/docs/plans/monitoring-migration-victoriametrics.md +++ b/docs/plans/monitoring-migration-victoriametrics.md @@ -61,53 +61,53 @@ If multi-year retention with downsampling becomes necessary later, Thanos can be ## Implementation Plan -### Phase 1: Create monitoring02 Host +### Phase 1: Create monitoring02 Host [COMPLETE] -Use `create-host` script which handles flake.nix and terraform/vms.tf automatically. - -1. **Run create-host**: `nix develop -c create-host monitoring02 10.69.13.24` -2. **Update VM resources** in `terraform/vms.tf`: - - 4 cores (same as monitoring01) - - 8GB RAM (double, for VictoriaMetrics headroom) - - 100GB disk (for 3+ months retention with compression) -3. **Update host configuration**: Import monitoring services -4. 
**Create Vault AppRole**: Add to `terraform/vault/approle.tf` +Host created and deployed at 10.69.13.24 (prod tier) with: +- 4 CPU cores, 8GB RAM, 60GB disk +- Vault integration enabled +- NATS-based remote deployment enabled +- Grafana with Kanidm OIDC deployed as test instance (`grafana-test.home.2rjus.net`) ### Phase 2: Set Up VictoriaMetrics Stack -Create new service module at `services/monitoring/victoriametrics/` for testing alongside existing -Prometheus config. Once validated, this can replace the Prometheus module. +New service module at `services/victoriametrics/` for VictoriaMetrics + vmalert + Alertmanager. +Imported by monitoring02 alongside the existing Grafana service. -1. **VictoriaMetrics** (port 8428): +1. **VictoriaMetrics** (port 8428): [DONE] - `services.victoriametrics.enable = true` - - `services.victoriametrics.retentionPeriod = "3m"` (3 months, increase later based on disk usage) - - Migrate scrape configs via `prometheusConfig` - - Use native push support (replaces Pushgateway) + - `retentionPeriod = "3"` (3 months) + - All scrape configs migrated from Prometheus (22 jobs including auto-generated) + - Static user override (DynamicUser disabled) for credential file access + - OpenBao token fetch service + 30min refresh timer + - Apiary bearer token via vault.secrets -2. **vmalert** for alerting rules: - - `services.vmalert.enable = true` - - Point to VictoriaMetrics for metrics evaluation - - Keep rules in separate `rules.yml` file (same format as Prometheus) - - No receiver configured during parallel operation (prevents duplicate alerts) +2. **vmalert** for alerting rules: [DONE] + - Points to VictoriaMetrics datasource at localhost:8428 + - Reuses existing `services/monitoring/rules.yml` directly via `settings.rule` + - Notifications blackholed (`notifier.blackhole`) during parallel operation (prevents duplicate alerts) -3. 
**Alertmanager** (port 9093): - - Keep existing configuration (alerttonotify webhook routing) - - Only enable receiver after cutover from monitoring01 +3. **Alertmanager** (port 9093): [DONE] + - Same configuration as monitoring01 (alerttonotify webhook routing) + - Will only receive alerts after cutover (vmalert notifier disabled) -4. **Loki** (port 3100): - - Same configuration as current +4. **Grafana** (port 3000): [DONE] + - VictoriaMetrics datasource (localhost:8428) as default + - monitoring01 Prometheus datasource kept for comparison during parallel operation + - Loki datasource pointing to monitoring01 (until Loki migrated) -5. **Grafana** (port 3000): - - Define dashboards declaratively via NixOS options (not imported from monitoring01) - - Reference existing dashboards on monitoring01 for content inspiration - - Configure VictoriaMetrics datasource (port 8428) - - Configure Loki datasource +5. **Loki** (port 3100): + - TODO: Same configuration as current 6. **Tempo** (ports 3200, 3201): - - Same configuration + - TODO: Same configuration 7. **Pyroscope** (port 4040): - - Same Docker-based deployment + - TODO: Same Docker-based deployment + +**Note:** pve-exporter and pushgateway scrape targets are not included on monitoring02. +pve-exporter requires a local exporter instance; pushgateway is replaced by VictoriaMetrics +native push support. 
### Phase 3: Parallel Operation @@ -171,24 +171,9 @@ Once ready to cut over: ## Current Progress -### monitoring02 Host Created (2026-02-08) - -Host deployed at 10.69.13.24 (test tier) with: -- 4 CPU cores, 8GB RAM, 60GB disk -- Vault integration enabled -- NATS-based remote deployment enabled - -### Grafana with Kanidm OIDC (2026-02-08) - -Grafana deployed on monitoring02 as a test instance (`grafana-test.home.2rjus.net`): -- Kanidm OIDC authentication (PKCE enabled) -- Role mapping: `admins` → Admin, others → Viewer -- Declarative datasources pointing to monitoring01 (Prometheus, Loki) -- Local Caddy for TLS termination via internal ACME CA - -This validates the Grafana + OIDC pattern before the full VictoriaMetrics migration. The existing -`services/monitoring/grafana.nix` on monitoring01 can be replaced with the new `services/grafana/` -module once monitoring02 becomes the primary monitoring host. +- **Phase 1** complete (2026-02-08): monitoring02 host created, Grafana with Kanidm OIDC validated +- **Phase 2** in progress (2026-02-17): VictoriaMetrics, vmalert, Alertmanager, Grafana datasources configured + - Remaining: Loki, Tempo, Pyroscope migration ## Open Questions @@ -198,31 +183,14 @@ module once monitoring02 becomes the primary monitoring host. ## VictoriaMetrics Service Configuration -Example NixOS configuration for monitoring02: +Implemented in `services/victoriametrics/default.nix`. 
Key design decisions: -```nix -# VictoriaMetrics replaces Prometheus -services.victoriametrics = { - enable = true; - retentionPeriod = "3m"; # 3 months, increase based on disk usage - prometheusConfig = { - global.scrape_interval = "15s"; - scrape_configs = [ - # Auto-generated node-exporter targets - # Service-specific scrape targets - # External targets - ]; - }; -}; - -# vmalert for alerting rules (no receiver during parallel operation) -services.vmalert = { - enable = true; - datasource.url = "http://localhost:8428"; - # notifier.alertmanager.url = "http://localhost:9093"; # Enable after cutover - rule = [ ./rules.yml ]; -}; -``` +- **Static user**: VictoriaMetrics NixOS module uses `DynamicUser`, overridden with a static + `victoriametrics` user so vault.secrets and credential files work correctly +- **Shared rules**: vmalert reuses `services/monitoring/rules.yml` via `settings.rule` path + reference (no YAML-to-Nix conversion needed) +- **Scrape config reuse**: Uses the same `lib/monitoring.nix` functions and + `services/monitoring/external-targets.nix` as Prometheus for auto-generated targets ## Rollback Plan diff --git a/hosts/monitoring02/default.nix b/hosts/monitoring02/default.nix index a102f2b..ea273a4 100644 --- a/hosts/monitoring02/default.nix +++ b/hosts/monitoring02/default.nix @@ -2,5 +2,6 @@ imports = [ ./configuration.nix ../../services/grafana + ../../services/victoriametrics ]; } \ No newline at end of file diff --git a/services/grafana/default.nix b/services/grafana/default.nix index 75413c1..b22a5d7 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -34,15 +34,21 @@ }; }; - # Declarative datasources pointing to monitoring01 + # Declarative datasources provision.datasources.settings = { apiVersion = 1; datasources = [ { - name = "Prometheus"; + name = "VictoriaMetrics"; + type = "prometheus"; + url = "http://localhost:8428"; + isDefault = true; + uid = "victoriametrics"; + } + { + name = "Prometheus 
(monitoring01)"; type = "prometheus"; url = "http://monitoring01.home.2rjus.net:9090"; - isDefault = true; uid = "prometheus"; } { diff --git a/services/victoriametrics/default.nix b/services/victoriametrics/default.nix new file mode 100644 index 0000000..e0c4f93 --- /dev/null +++ b/services/victoriametrics/default.nix @@ -0,0 +1,211 @@ +{ self, config, lib, pkgs, ... }: +let + monLib = import ../../lib/monitoring.nix { inherit lib; }; + externalTargets = import ../monitoring/external-targets.nix; + + nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets; + autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets; + + # Script to fetch AppRole token for VictoriaMetrics to use when scraping OpenBao metrics + fetchOpenbaoToken = pkgs.writeShellApplication { + name = "fetch-openbao-token-vm"; + runtimeInputs = [ pkgs.curl pkgs.jq ]; + text = '' + VAULT_ADDR="https://vault01.home.2rjus.net:8200" + APPROLE_DIR="/var/lib/vault/approle" + OUTPUT_FILE="/run/secrets/victoriametrics/openbao-token" + + # Read AppRole credentials + if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! 
-f "$APPROLE_DIR/secret-id" ]; then + echo "AppRole credentials not found at $APPROLE_DIR" >&2 + exit 1 + fi + + ROLE_ID=$(cat "$APPROLE_DIR/role-id") + SECRET_ID=$(cat "$APPROLE_DIR/secret-id") + + # Authenticate to Vault (TLS is verified: the request body carries the AppRole secret-id) + AUTH_RESPONSE=$(curl -sf -X POST \ + -d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \ + "$VAULT_ADDR/v1/auth/approle/login") + + # Extract token + VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token') + if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then + echo "Failed to extract Vault token from response" >&2 + exit 1 + fi + + # Write token to file (created under umask 077 so it is never world-readable) + mkdir -p "$(dirname "$OUTPUT_FILE")" + (umask 077; echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE") + chown victoriametrics:victoriametrics "$OUTPUT_FILE" + chmod 0400 "$OUTPUT_FILE" + + echo "Successfully fetched OpenBao token" + ''; + }; + + scrapeConfigs = [ + # Auto-generated node-exporter targets from flake hosts + external + { + job_name = "node-exporter"; + static_configs = nodeExporterTargets; + } + # Systemd exporter on all hosts (same targets, different port) + { + job_name = "systemd-exporter"; + static_configs = map + (cfg: cfg // { + targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets; + }) + nodeExporterTargets; + } + # Local monitoring services + { + job_name = "victoriametrics"; + static_configs = [{ targets = [ "localhost:8428" ]; }]; + } + { + job_name = "loki"; + static_configs = [{ targets = [ "localhost:3100" ]; }]; + } + { + job_name = "grafana"; + static_configs = [{ targets = [ "localhost:3000" ]; }]; + } + { + job_name = "alertmanager"; + static_configs = [{ targets = [ "localhost:9093" ]; }]; + } + # Caddy metrics from nix-cache02 + { + job_name = "nix-cache_caddy"; + scheme = "https"; + static_configs = [{ targets = [ "nix-cache.home.2rjus.net" ]; }]; + } + # OpenBao metrics with bearer token auth + { + job_name = "openbao"; + scheme = "https"; + metrics_path = "/v1/sys/metrics"; + params = { format = [ "prometheus" ]; }; +
static_configs = [{ targets = [ "vault01.home.2rjus.net:8200" ]; }]; + authorization = { + type = "Bearer"; + credentials_file = "/run/secrets/victoriametrics/openbao-token"; + }; + } + # Apiary external service (scraped every 60s with a bearer token read from a credentials file) + { + job_name = "apiary"; + scheme = "https"; + scrape_interval = "60s"; + static_configs = [{ targets = [ "apiary.t-juice.club" ]; }]; + authorization = { + type = "Bearer"; + credentials_file = "/run/secrets/victoriametrics-apiary-token"; + }; + } + ] ++ autoScrapeConfigs; +in +{ + # Static user for VictoriaMetrics (overrides DynamicUser) so vault.secrets + # and credential files can be owned by this user + users.users.victoriametrics = { + isSystemUser = true; + group = "victoriametrics"; + }; + users.groups.victoriametrics = { }; + + # Override DynamicUser since we need a static user for credential file access + systemd.services.victoriametrics.serviceConfig = { + DynamicUser = lib.mkForce false; + User = "victoriametrics"; + Group = "victoriametrics"; + }; + + # Oneshot service fetching the AppRole token for OpenBao scraping; ordered before and required by victoriametrics.service + systemd.services.victoriametrics-openbao-token = { + description = "Fetch OpenBao token for VictoriaMetrics metrics scraping"; + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; + before = [ "victoriametrics.service" ]; + requiredBy = [ "victoriametrics.service" ]; + + serviceConfig = { + Type = "oneshot"; + ExecStart = lib.getExe fetchOpenbaoToken; + }; + }; + + # Timer re-running the fetch 5min after boot, then every 30min (+ up to 5min random delay); AppRole tokens have 1-hour TTL + systemd.timers.victoriametrics-openbao-token = { + description = "Refresh OpenBao token for VictoriaMetrics"; + wantedBy = [ "timers.target" ]; + timerConfig = { + OnBootSec = "5min"; + OnUnitActiveSec = "30min"; + RandomizedDelaySec = "5min"; + }; + }; + + # Fetch apiary bearer token from Vault (the secret is read from the monitoring01 path) + vault.secrets.victoriametrics-apiary-token = { + secretPath = "hosts/monitoring01/apiary-token"; + extractKey = "password"; + owner = "victoriametrics"; + group =
"victoriametrics"; + services = [ "victoriametrics" ]; + }; + + services.victoriametrics = { + enable = true; + retentionPeriod = "3"; # 3 months + # Disable config check since we reference external credential files + checkConfig = false; + prometheusConfig = { + global.scrape_interval = "15s"; + scrape_configs = scrapeConfigs; + }; + }; + + # vmalert for alerting rules - no notifier during parallel operation + services.vmalert.instances.default = { + enable = true; + settings = { + "datasource.url" = "http://localhost:8428"; + # Blackhole notifications during parallel operation to prevent duplicate alerts. + # Replace with notifier.url after cutover from monitoring01: + # "notifier.url" = [ "http://localhost:9093" ]; + "notifier.blackhole" = true; + "rule" = [ ../monitoring/rules.yml ]; + }; + }; + + # Alertmanager - same config as monitoring01 but will only receive + # alerts after cutover (vmalert notifier is disabled above) + services.prometheus.alertmanager = { + enable = true; + configuration = { + global = { }; + route = { + receiver = "webhook_natstonotify"; + group_wait = "30s"; + group_interval = "5m"; + repeat_interval = "1h"; + group_by = [ "alertname" ]; + }; + receivers = [ + { + name = "webhook_natstonotify"; + webhook_configs = [ + { + url = "http://localhost:5001/alert"; + } + ]; + } + ]; + }; + }; +} -- 2.49.1 From 4cbaa334753eac5329c2f93b35942f7f14894cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 17 Feb 2026 00:36:11 +0100 Subject: [PATCH 2/3] monitoring02: add Caddy reverse proxy for VictoriaMetrics and vmalert Add metrics.home.2rjus.net and vmalert.home.2rjus.net CNAMEs with Caddy TLS termination via internal ACME CA. Refactors Grafana's Caddy config from configFile to globalConfig + virtualHosts so both modules can contribute routes to the same Caddy instance. 
Co-Authored-By: Claude Opus 4.6 --- hosts/monitoring02/configuration.nix | 3 +-- services/grafana/default.nix | 26 ++++++++++++-------------- services/victoriametrics/default.nix | 8 ++++++++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/hosts/monitoring02/configuration.nix b/hosts/monitoring02/configuration.nix index 1031c36..3cf2f8d 100644 --- a/hosts/monitoring02/configuration.nix +++ b/hosts/monitoring02/configuration.nix @@ -18,8 +18,7 @@ role = "monitoring"; }; - # DNS CNAME for Grafana test instance - homelab.dns.cnames = [ "grafana-test" ]; + homelab.dns.cnames = [ "grafana-test" "metrics" "vmalert" ]; # Enable Vault integration vault.enable = true; diff --git a/services/grafana/default.nix b/services/grafana/default.nix index b22a5d7..a0dc7b4 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -87,22 +87,20 @@ services.caddy = { enable = true; package = pkgs.unstable.caddy; - configFile = pkgs.writeText "Caddyfile" '' - { - acme_ca https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory - metrics - } - - grafana-test.home.2rjus.net { - log { - output file /var/log/caddy/grafana.log { - mode 644 - } + globalConfig = '' + acme_ca https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory + metrics + ''; + virtualHosts."grafana-test.home.2rjus.net".extraConfig = '' + log { + output file /var/log/caddy/grafana.log { + mode 644 } - - reverse_proxy http://127.0.0.1:3000 } - + reverse_proxy http://127.0.0.1:3000 + ''; + # Metrics endpoint on plain HTTP for Prometheus scraping + extraConfig = '' http://${config.networking.hostName}.home.2rjus.net/metrics { metrics } diff --git a/services/victoriametrics/default.nix b/services/victoriametrics/default.nix index e0c4f93..02aee75 100644 --- a/services/victoriametrics/default.nix +++ b/services/victoriametrics/default.nix @@ -183,6 +183,14 @@ in }; }; + # Caddy reverse proxy for VictoriaMetrics and vmalert + 
services.caddy.virtualHosts."metrics.home.2rjus.net".extraConfig = '' + reverse_proxy http://127.0.0.1:8428 + ''; + services.caddy.virtualHosts."vmalert.home.2rjus.net".extraConfig = '' + reverse_proxy http://127.0.0.1:8880 + ''; + # Alertmanager - same config as monitoring01 but will only receive # alerts after cutover (vmalert notifier is disabled above) services.prometheus.alertmanager = { -- 2.49.1 From a013e80f1acaec43d2a8f2055afb30905ba2af72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 17 Feb 2026 00:52:07 +0100 Subject: [PATCH 3/3] terraform: grant monitoring02 access to apiary-token secret Co-Authored-By: Claude Opus 4.6 --- terraform/vault/approle.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/terraform/vault/approle.tf b/terraform/vault/approle.tf index 6f05de0..8542812 100644 --- a/terraform/vault/approle.tf +++ b/terraform/vault/approle.tf @@ -104,10 +104,11 @@ locals { ] } - # monitoring02: Grafana test instance + # monitoring02: Grafana + VictoriaMetrics "monitoring02" = { paths = [ "secret/data/hosts/monitoring02/*", + "secret/data/hosts/monitoring01/apiary-token", "secret/data/services/grafana/*", ] } -- 2.49.1