From 6184f4cbbb4282dce1a387f17b06724256daf7c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= <torjus@usit.uio.no>
Date: Tue, 17 Feb 2026 20:56:55 +0100
Subject: [PATCH] monitoring02: enable alerting and migrate CNAMEs from
 http-proxy

- Switch vmalert from blackhole mode to sending alerts to local
  Alertmanager
- Import alerttonotify service so alerts route to NATS notifications
- Move alertmanager and grafana CNAMEs from http-proxy to monitoring02
- Add monitoring CNAME to monitoring02
- Add Caddy reverse proxy entries for alertmanager and grafana
- Remove prometheus, alertmanager, and grafana Caddy entries and the
  prometheus CNAME from http-proxy (alertmanager and grafana are now
  served directly by monitoring02)
- Move monitoring02 Vault AppRole to hosts-generated.tf with
  extra_policies support and prometheus-metrics policy
- Update Promtail to use authenticated loki.home.2rjus.net endpoint
  only (remove unauthenticated monitoring01 client)
- Update pipe-to-loki and bootstrap to use loki.home.2rjus.net with
  basic auth from Vault secret
- Update migration plan with current status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../monitoring-migration-victoriametrics.md   | 139 ++++++------------
 hosts/http-proxy/configuration.nix            |   3 -
 hosts/monitoring02/configuration.nix          |   2 +-
 hosts/monitoring02/default.nix                |   1 +
 hosts/template2/bootstrap.nix                 |   9 +-
 services/grafana/default.nix                  |   8 +
 services/http-proxy/proxy.nix                 |  25 +---
 services/victoriametrics/default.nix          |  14 +-
 system/monitoring/logs.nix                    |   4 -
 system/pipe-to-loki.nix                       |   9 +-
 terraform/vault/approle.tf                    |   9 --
 terraform/vault/hosts-generated.tf            |  16 +-
 12 files changed, 95 insertions(+), 144 deletions(-)

diff --git a/docs/plans/monitoring-migration-victoriametrics.md b/docs/plans/monitoring-migration-victoriametrics.md
index d562c41..ffe1b5f 100644
--- a/docs/plans/monitoring-migration-victoriametrics.md
+++ b/docs/plans/monitoring-migration-victoriametrics.md
@@ -8,23 +8,17 @@ a `monitoring` CNAME for seamless transition.
 
 ## Current State
 
-**monitoring01** (10.69.13.13):
-- 4 CPU cores, 4GB RAM, 33GB disk
-- Prometheus with 30-day retention (15s scrape interval)
-- Alertmanager (routes to alerttonotify webhook)
-- Grafana (dashboards, datasources)
-- Loki (log aggregation from all hosts via Promtail)
-- Tempo (distributed tracing) - not actively used
-- Pyroscope (continuous profiling) - not actively used
+**monitoring02** (10.69.13.24) - **PRIMARY**:
+- 4 CPU cores, 8GB RAM, 60GB disk
+- VictoriaMetrics with 3-month retention
+- vmalert with alerting enabled (routes to local Alertmanager)
+- Alertmanager -> alerttonotify -> NATS notification pipeline
+- Grafana with Kanidm OIDC (`grafana.home.2rjus.net`)
+- Loki (log aggregation)
+- CNAMEs: monitoring, alertmanager, grafana, grafana-test, metrics, vmalert, loki
 
-**Hardcoded References to monitoring01:**
-- `system/monitoring/logs.nix` - Promtail sends logs to `http://monitoring01.home.2rjus.net:3100`
-- `hosts/template2/bootstrap.nix` - Bootstrap logs to Loki (keep as-is until decommission)
-- `services/http-proxy/proxy.nix` - Caddy proxies Prometheus, Alertmanager, Grafana, Pyroscope, Pushgateway
-
-**Auto-generated:**
-- Prometheus scrape targets (from `lib/monitoring.nix` + `homelab.monitoring.scrapeTargets`)
-- Node-exporter targets (from all hosts with static IPs)
+**monitoring01** (10.69.13.13) - **SHUT DOWN**:
+- No longer running, pending decommission
 
 ## Decision: VictoriaMetrics
 
@@ -67,12 +61,12 @@ Host created and deployed at 10.69.13.24 (prod tier) with:
 - NATS-based remote deployment enabled
 - Grafana with Kanidm OIDC deployed as test instance (`grafana-test.home.2rjus.net`)
 
-### Phase 2: Set Up VictoriaMetrics Stack
+### Phase 2: Set Up VictoriaMetrics Stack [COMPLETE]
 
 New service module at `services/victoriametrics/` for VictoriaMetrics + vmalert + Alertmanager.
 Imported by monitoring02 alongside the existing Grafana service.
 
-1. **VictoriaMetrics** (port 8428): [DONE]
+1. **VictoriaMetrics** (port 8428):
    - `services.victoriametrics.enable = true`
    - `retentionPeriod = "3"` (3 months)
    - All scrape configs migrated from Prometheus (22 jobs including auto-generated)
@@ -80,21 +74,20 @@ Imported by monitoring02 alongside the existing Grafana service.
    - OpenBao token fetch service + 30min refresh timer
    - Apiary bearer token via vault.secrets
 
-2. **vmalert** for alerting rules: [DONE]
+2. **vmalert** for alerting rules:
    - Points to VictoriaMetrics datasource at localhost:8428
    - Reuses existing `services/monitoring/rules.yml` directly via `settings.rule`
-   - No notifier configured during parallel operation (prevents duplicate alerts)
+   - Notifier sends to local Alertmanager at localhost:9093
 
-3. **Alertmanager** (port 9093): [DONE]
+3. **Alertmanager** (port 9093):
    - Same configuration as monitoring01 (alerttonotify webhook routing)
-   - Will only receive alerts after cutover (vmalert notifier disabled)
+   - alerttonotify imported on monitoring02, routes alerts via NATS
 
-4. **Grafana** (port 3000): [DONE]
+4. **Grafana** (port 3000):
    - VictoriaMetrics datasource (localhost:8428) as default
-   - monitoring01 Prometheus datasource kept for comparison during parallel operation
-   - Loki datasource pointing to localhost (after Loki migrated to monitoring02)
+   - Loki datasource pointing to localhost:3100
 
-5. **Loki** (port 3100): [DONE]
+5. **Loki** (port 3100):
    - Same configuration as monitoring01 in standalone `services/loki/` module
    - Grafana datasource updated to localhost:3100
 
@@ -102,76 +95,45 @@ Imported by monitoring02 alongside the existing Grafana service.
 pve-exporter requires a local exporter instance; pushgateway is replaced by VictoriaMetrics
 native push support.
 
-### Phase 3: Parallel Operation
+### Phase 3: Parallel Operation [COMPLETE]
 
-Run both monitoring01 and monitoring02 simultaneously:
+Ran both monitoring01 and monitoring02 simultaneously to validate data collection and dashboards.
 
-1. **Dual scraping**: Both hosts scrape the same targets
-   - Validates VictoriaMetrics is collecting data correctly
+### Phase 4: Add monitoring CNAME [COMPLETE]
 
-2. **Dual log shipping**: Configure Promtail to send logs to both Loki instances
-   - Add second client in `system/monitoring/logs.nix` pointing to monitoring02
+Added CNAMEs to monitoring02: monitoring, alertmanager, grafana, metrics, vmalert, loki.
 
-3. **Validate dashboards**: Access Grafana on monitoring02, verify dashboards work
+### Phase 5: Update References [COMPLETE]
 
-4. **Validate alerts**: Verify vmalert evaluates rules correctly (no receiver = no notifications)
+- Moved alertmanager and grafana CNAMEs from http-proxy to monitoring02; removed the prometheus CNAME
+- Removed corresponding Caddy reverse proxy entries from http-proxy
+- monitoring02 Caddy serves alertmanager, grafana, metrics, vmalert directly
 
-5. **Compare resource usage**: Monitor disk/memory consumption between hosts
+### Phase 6: Enable Alerting [COMPLETE]
 
-### Phase 4: Add monitoring CNAME
+- Switched vmalert from blackhole mode to local Alertmanager
+- alerttonotify service running on monitoring02 (NATS nkey from Vault)
+- prometheus-metrics Vault policy added for OpenBao scraping
+- Full alerting pipeline verified: vmalert -> Alertmanager -> alerttonotify -> NATS
 
-Add CNAME to monitoring02 once validated:
+### Phase 7: Cutover and Decommission [IN PROGRESS]
 
-```nix
-# hosts/monitoring02/configuration.nix
-homelab.dns.cnames = [ "monitoring" ];
-```
+- monitoring01 shut down (2026-02-17)
+- Vault AppRole moved from approle.tf to hosts-generated.tf with extra_policies support
 
-This creates `monitoring.home.2rjus.net` pointing to monitoring02.
+**Remaining cleanup (separate branch):**
+- [x] Update `system/monitoring/logs.nix` - Promtail now ships only to the authenticated `loki.home.2rjus.net` endpoint
+- [x] Update `hosts/template2/bootstrap.nix` - Bootstrap Loki URL now points to `loki.home.2rjus.net`
+- [ ] Remove monitoring01 from flake.nix and host configuration
+- [ ] Destroy monitoring01 VM in Proxmox
+- [ ] Remove monitoring01 from terraform state
+- [ ] Remove or archive `services/monitoring/` (Prometheus config)
 
-### Phase 5: Update References
+## Completed
 
-Update hardcoded references to use the CNAME:
-
-1. **system/monitoring/logs.nix**:
-   - Remove dual-shipping, point only to `http://monitoring.home.2rjus.net:3100`
-
-2. **services/http-proxy/proxy.nix**: Update reverse proxy backends:
-   - prometheus.home.2rjus.net -> monitoring.home.2rjus.net:8428
-   - alertmanager.home.2rjus.net -> monitoring.home.2rjus.net:9093
-   - grafana.home.2rjus.net -> monitoring.home.2rjus.net:3000
-
-Note: `hosts/template2/bootstrap.nix` stays pointed at monitoring01 until decommission.
-
-### Phase 6: Enable Alerting
-
-Once ready to cut over:
-1. Enable Alertmanager receiver on monitoring02
-2. Verify test alerts route correctly
-
-### Phase 7: Cutover and Decommission
-
-1. **Stop monitoring01**: Prevent duplicate alerts during transition
-2. **Update bootstrap.nix**: Point to `monitoring.home.2rjus.net`
-3. **Verify all targets scraped**: Check VictoriaMetrics UI
-4. **Verify logs flowing**: Check Loki on monitoring02
-5. **Decommission monitoring01**:
-   - Remove from flake.nix
-   - Remove host configuration
-   - Destroy VM in Proxmox
-   - Remove from terraform state
-
-## Current Progress
-
-- **Phase 1** complete (2026-02-08): monitoring02 host created, Grafana with Kanidm OIDC validated
-- **Phase 2** complete (2026-02-17): VictoriaMetrics, vmalert, Alertmanager, Loki, Grafana datasources configured
-  - Tempo and Pyroscope deferred (not actively used; can be added later if needed)
-
-## Open Questions
-
-- [ ] What disk size for monitoring02? Current 60GB may need expansion for 3+ months with VictoriaMetrics
-- [ ] Which dashboards to recreate declaratively? (Review monitoring01 Grafana for current set)
-- [ ] Consider replacing Promtail with Grafana Alloy (`services.alloy`, v1.12.2 in nixpkgs). Promtail is in maintenance mode and Grafana recommends Alloy as the successor. Alloy is a unified collector (logs, metrics, traces, profiles) but uses its own "River" config format instead of YAML, so less Nix-native ergonomics. Could bundle the migration with monitoring02 to consolidate disruption.
+- 2026-02-08: Phase 1 - monitoring02 host created
+- 2026-02-17: Phase 2 - VictoriaMetrics, vmalert, Alertmanager, Loki, Grafana configured
+- 2026-02-17: Phases 4-6 - CNAMEs migrated, references updated, alerting enabled; monitoring01 shut down
 
 ## VictoriaMetrics Service Configuration
 
@@ -184,14 +146,6 @@ Implemented in `services/victoriametrics/default.nix`. Key design decisions:
 - **Scrape config reuse**: Uses the same `lib/monitoring.nix` functions and
   `services/monitoring/external-targets.nix` as Prometheus for auto-generated targets
 
-## Rollback Plan
-
-If issues arise after cutover:
-1. Move `monitoring` CNAME back to monitoring01
-2. Restart monitoring01 services
-3. Revert Promtail config to point only to monitoring01
-4. Revert http-proxy backends
-
 ## Notes
 
 - VictoriaMetrics uses port 8428 vs Prometheus 9090
@@ -199,3 +153,4 @@ If issues arise after cutover:
 - VictoriaMetrics native push replaces Pushgateway (remove from http-proxy if not needed)
 - monitoring02 deployed via OpenTofu using `create-host` script
 - Grafana dashboards defined declaratively via NixOS, not imported from monitoring01 state
+- Tempo and Pyroscope deferred (not actively used; can be added later if needed)
diff --git a/hosts/http-proxy/configuration.nix b/hosts/http-proxy/configuration.nix
index 75364f8..25e080d 100644
--- a/hosts/http-proxy/configuration.nix
+++ b/hosts/http-proxy/configuration.nix
@@ -18,9 +18,6 @@
     "sonarr"
     "ha"
     "z2m"
-    "grafana"
-    "prometheus"
-    "alertmanager"
     "jelly"
     "pyroscope"
     "pushgw"
diff --git a/hosts/monitoring02/configuration.nix b/hosts/monitoring02/configuration.nix
index 2616555..8e792ea 100644
--- a/hosts/monitoring02/configuration.nix
+++ b/hosts/monitoring02/configuration.nix
@@ -18,7 +18,7 @@
     role = "monitoring";
   };
 
-  homelab.dns.cnames = [ "grafana-test" "metrics" "vmalert" "loki" ];
+  homelab.dns.cnames = [ "monitoring" "alertmanager" "grafana" "grafana-test" "metrics" "vmalert" "loki" ];
 
   # Enable Vault integration
   vault.enable = true;
diff --git a/hosts/monitoring02/default.nix b/hosts/monitoring02/default.nix
index a8ef155..252daf0 100644
--- a/hosts/monitoring02/default.nix
+++ b/hosts/monitoring02/default.nix
@@ -4,5 +4,6 @@
     ../../services/grafana
     ../../services/victoriametrics
     ../../services/loki
+    ../../services/monitoring/alerttonotify.nix
   ];
 }
\ No newline at end of file
diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix
index 8accb5a..e9fc4fc 100644
--- a/hosts/template2/bootstrap.nix
+++ b/hosts/template2/bootstrap.nix
@@ -6,7 +6,8 @@ let
     text = ''
       set -euo pipefail
 
-      LOKI_URL="http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"
+      LOKI_URL="https://loki.home.2rjus.net/loki/api/v1/push"
+      LOKI_AUTH_FILE="/run/secrets/promtail-loki-auth"
 
       # Send a log entry to Loki with bootstrap status
       # Usage: log_to_loki <stage> <message>
@@ -36,8 +37,14 @@ let
             }]
           }')
 
+        local auth_args=()
+        if [[ -f "$LOKI_AUTH_FILE" ]]; then
+          auth_args=(-u "promtail:$(cat "$LOKI_AUTH_FILE")")
+        fi
+
         curl -s --connect-timeout 2 --max-time 5 \
           -X POST \
+          "''${auth_args[@]}" \
           -H "Content-Type: application/json" \
           -d "$payload" \
           "$LOKI_URL" >/dev/null 2>&1 || true
diff --git a/services/grafana/default.nix b/services/grafana/default.nix
index ed5aece..8fb645f 100644
--- a/services/grafana/default.nix
+++ b/services/grafana/default.nix
@@ -91,6 +91,14 @@
       acme_ca https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory
       metrics
     '';
+    virtualHosts."grafana.home.2rjus.net".extraConfig = ''
+      log {
+        output file /var/log/caddy/grafana.log {
+          mode 644
+        }
+      }
+      reverse_proxy http://127.0.0.1:3000
+    '';
     virtualHosts."grafana-test.home.2rjus.net".extraConfig = ''
       log {
         output file /var/log/caddy/grafana.log {
diff --git a/services/http-proxy/proxy.nix b/services/http-proxy/proxy.nix
index 8756dd4..613a162 100644
--- a/services/http-proxy/proxy.nix
+++ b/services/http-proxy/proxy.nix
@@ -54,30 +54,7 @@
         }
         reverse_proxy http://ha1.home.2rjus.net:8080
       }
-      prometheus.home.2rjus.net {
-        log {
-          output file /var/log/caddy/prometheus.log {
-            mode 644
-          }
-        }
-        reverse_proxy http://monitoring01.home.2rjus.net:9090
-      }
-      alertmanager.home.2rjus.net {
-        log {
-          output file /var/log/caddy/alertmanager.log {
-            mode 644
-          }
-        }
-        reverse_proxy http://monitoring01.home.2rjus.net:9093
-      }
-      grafana.home.2rjus.net {
-        log {
-          output file /var/log/caddy/grafana.log {
-            mode 644
-          }
-        }
-        reverse_proxy http://monitoring01.home.2rjus.net:3000
-      }
+
       jelly.home.2rjus.net {
         log {
           output file /var/log/caddy/jelly.log {
diff --git a/services/victoriametrics/default.nix b/services/victoriametrics/default.nix
index 02aee75..2c2af1b 100644
--- a/services/victoriametrics/default.nix
+++ b/services/victoriametrics/default.nix
@@ -170,15 +170,12 @@ in
     };
   };
 
-  # vmalert for alerting rules - no notifier during parallel operation
+  # vmalert for alerting rules
   services.vmalert.instances.default = {
     enable = true;
     settings = {
       "datasource.url" = "http://localhost:8428";
-      # Blackhole notifications during parallel operation to prevent duplicate alerts.
-      # Replace with notifier.url after cutover from monitoring01:
-      # "notifier.url" = [ "http://localhost:9093" ];
-      "notifier.blackhole" = true;
+      "notifier.url" = [ "http://localhost:9093" ];
       "rule" = [ ../monitoring/rules.yml ];
     };
   };
@@ -191,8 +188,11 @@ in
     reverse_proxy http://127.0.0.1:8880
   '';
 
-  # Alertmanager - same config as monitoring01 but will only receive
-  # alerts after cutover (vmalert notifier is disabled above)
+  # Alertmanager
+  services.caddy.virtualHosts."alertmanager.home.2rjus.net".extraConfig = ''
+    reverse_proxy http://127.0.0.1:9093
+  '';
+
   services.prometheus.alertmanager = {
     enable = true;
     configuration = {
diff --git a/system/monitoring/logs.nix b/system/monitoring/logs.nix
index 6a21a62..a497a19 100644
--- a/system/monitoring/logs.nix
+++ b/system/monitoring/logs.nix
@@ -38,10 +38,6 @@ in
       };
 
       clients = [
-        {
-          url = "http://monitoring01.home.2rjus.net:3100/loki/api/v1/push";
-        }
-      ] ++ lib.optionals config.vault.enable [
         {
           url = "https://loki.home.2rjus.net/loki/api/v1/push";
           basic_auth = {
diff --git a/system/pipe-to-loki.nix b/system/pipe-to-loki.nix
index 7c4f3e4..e12d13b 100644
--- a/system/pipe-to-loki.nix
+++ b/system/pipe-to-loki.nix
@@ -16,7 +16,8 @@ let
     text = ''
       set -euo pipefail
 
-      LOKI_URL="http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"
+      LOKI_URL="https://loki.home.2rjus.net/loki/api/v1/push"
+      LOKI_AUTH_FILE="/run/secrets/promtail-loki-auth"
       HOSTNAME=$(hostname)
       SESSION_ID=""
       RECORD_MODE=false
@@ -69,7 +70,13 @@ let
             }]
           }')
 
+        local auth_args=()
+        if [[ -f "$LOKI_AUTH_FILE" ]]; then
+          auth_args=(-u "promtail:$(cat "$LOKI_AUTH_FILE")")
+        fi
+
         if curl -s -X POST "$LOKI_URL" \
+          "''${auth_args[@]}" \
           -H "Content-Type: application/json" \
           -d "$payload" > /dev/null; then
           return 0
diff --git a/terraform/vault/approle.tf b/terraform/vault/approle.tf
index 5f76056..1e5956a 100644
--- a/terraform/vault/approle.tf
+++ b/terraform/vault/approle.tf
@@ -115,15 +115,6 @@ locals {
       ]
     }
 
-    # monitoring02: Grafana + VictoriaMetrics
-    "monitoring02" = {
-      paths = [
-        "secret/data/hosts/monitoring02/*",
-        "secret/data/hosts/monitoring01/apiary-token",
-        "secret/data/services/grafana/*",
-      ]
-    }
-
   }
 }
 
diff --git a/terraform/vault/hosts-generated.tf b/terraform/vault/hosts-generated.tf
index 4854b70..5257919 100644
--- a/terraform/vault/hosts-generated.tf
+++ b/terraform/vault/hosts-generated.tf
@@ -44,7 +44,16 @@ locals {
         "secret/data/hosts/garage01/*",
       ]
     }
-  
+    "monitoring02" = {
+      paths = [
+        "secret/data/hosts/monitoring02/*",
+        "secret/data/hosts/monitoring01/apiary-token",
+        "secret/data/services/grafana/*",
+        "secret/data/shared/nats/nkey",
+      ]
+      extra_policies = ["prometheus-metrics"]
+    }
+
   }
 
   # Placeholder secrets - user should add actual secrets manually or via tofu
@@ -74,7 +83,10 @@ resource "vault_approle_auth_backend_role" "generated_hosts" {
 
   backend            = vault_auth_backend.approle.path
   role_name          = each.key
-  token_policies     = ["host-${each.key}", "homelab-deploy", "nixos-exporter", "loki-push"]
+  token_policies     = concat(
+    ["host-${each.key}", "homelab-deploy", "nixos-exporter", "loki-push"],
+    lookup(each.value, "extra_policies", [])
+  )
   secret_id_ttl      = 0 # Never expire (wrapped tokens provide time limit)
   token_ttl          = 3600
   token_max_ttl      = 3600