From 4f593126c00430502a85227e3692dc5a8f9cd1c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 17 Feb 2026 21:50:20 +0100 Subject: [PATCH] monitoring01: remove host and migrate services to monitoring02 Remove monitoring01 host configuration and unused service modules (prometheus, grafana, loki, tempo, pyroscope). Migrate blackbox, exportarr, and pve exporters to monitoring02 with scrape configs moved to VictoriaMetrics. Update alert rules, terraform vault policies/secrets, http-proxy entries, and documentation to reflect the monitoring02 migration. Co-Authored-By: Claude Opus 4.6 --- .claude/agents/investigate-alarm.md | 2 +- .claude/skills/observability/SKILL.md | 23 +- CLAUDE.md | 20 +- README.md | 4 +- flake.nix | 9 - hosts/http-proxy/configuration.nix | 2 - hosts/monitoring01/configuration.nix | 114 -------- hosts/monitoring01/default.nix | 7 - hosts/monitoring01/hardware-configuration.nix | 42 --- hosts/monitoring02/default.nix | 3 + scripts/vault-fetch/README.md | 12 +- scripts/vault-fetch/vault-fetch.sh | 4 +- services/grafana/default.nix | 8 +- services/http-proxy/proxy.nix | 16 -- services/monitoring/blackbox.nix | 61 ---- services/monitoring/default.nix | 14 - services/monitoring/exportarr.nix | 10 - services/monitoring/grafana.nix | 11 - services/monitoring/loki.nix | 58 ---- services/monitoring/prometheus.nix | 267 ------------------ services/monitoring/pve.nix | 2 +- services/monitoring/pyroscope.nix | 8 - services/monitoring/rules.yml | 46 +-- services/monitoring/tempo.nix | 37 --- services/victoriametrics/default.nix | 53 +++- system/vault-secrets.nix | 14 +- terraform/vault/approle.tf | 24 +- terraform/vault/hosts-generated.tf | 2 +- terraform/vault/secrets.tf | 15 +- 29 files changed, 115 insertions(+), 773 deletions(-) delete mode 100644 hosts/monitoring01/configuration.nix delete mode 100644 hosts/monitoring01/default.nix delete mode 100644 hosts/monitoring01/hardware-configuration.nix delete mode 100644 
services/monitoring/default.nix delete mode 100644 services/monitoring/grafana.nix delete mode 100644 services/monitoring/loki.nix delete mode 100644 services/monitoring/prometheus.nix delete mode 100644 services/monitoring/pyroscope.nix delete mode 100644 services/monitoring/tempo.nix diff --git a/.claude/agents/investigate-alarm.md b/.claude/agents/investigate-alarm.md index 11462ea..ebdf233 100644 --- a/.claude/agents/investigate-alarm.md +++ b/.claude/agents/investigate-alarm.md @@ -130,7 +130,7 @@ get_commit_info() # Get full details of a specific change ``` **Example workflow for a service-related alert:** -1. Query `nixos_flake_info{hostname="monitoring01"}` → `current_rev: 8959829` +1. Query `nixos_flake_info{hostname="monitoring02"}` → `current_rev: 8959829` 2. `resolve_ref("master")` → `4633421` 3. `is_ancestor("8959829", "4633421")` → Yes, host is behind 4. `commits_between("8959829", "4633421")` → 7 commits missing diff --git a/.claude/skills/observability/SKILL.md b/.claude/skills/observability/SKILL.md index f89ea93..3b3886f 100644 --- a/.claude/skills/observability/SKILL.md +++ b/.claude/skills/observability/SKILL.md @@ -30,7 +30,7 @@ Use the `lab-monitoring` MCP server tools: ### Label Reference Available labels for log queries: -- `hostname` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) - matches the Prometheus `hostname` label +- `hostname` - Hostname (e.g., `ns1`, `monitoring02`, `ha1`) - matches the Prometheus `hostname` label - `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`) - `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs) - `filename` - For `varlog` job, the log file path @@ -54,7 +54,7 @@ Journal logs are JSON-formatted. Key fields: **All logs from a host:** ```logql -{hostname="monitoring01"} +{hostname="monitoring02"} ``` **Logs from a service across all hosts:** @@ -74,7 +74,7 @@ Journal logs are JSON-formatted. 
Key fields: **Regex matching:** ```logql -{systemd_unit="prometheus.service"} |~ "scrape.*failed" +{systemd_unit="victoriametrics.service"} |~ "scrape.*failed" ``` **Filter by level (journal scrape only):** @@ -109,7 +109,7 @@ Default lookback is 1 hour. Use `start` parameter for older logs: Useful systemd units for troubleshooting: - `nixos-upgrade.service` - Daily auto-upgrade logs - `nsd.service` - DNS server (ns1/ns2) -- `prometheus.service` - Metrics collection +- `victoriametrics.service` - Metrics collection - `loki.service` - Log aggregation - `caddy.service` - Reverse proxy - `home-assistant.service` - Home automation @@ -152,7 +152,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl Parse JSON and filter on fields: ```logql -{systemd_unit="prometheus.service"} | json | PRIORITY="3" +{systemd_unit="victoriametrics.service"} | json | PRIORITY="3" ``` --- @@ -242,12 +242,11 @@ All available Prometheus job names: - `unbound` - DNS resolver metrics (ns1, ns2) - `wireguard` - VPN tunnel metrics (http-proxy) -**Monitoring stack (localhost on monitoring01):** -- `prometheus` - Prometheus self-metrics +**Monitoring stack (localhost on monitoring02):** +- `victoriametrics` - VictoriaMetrics self-metrics - `loki` - Loki self-metrics - `grafana` - Grafana self-metrics - `alertmanager` - Alertmanager metrics -- `pushgateway` - Push-based metrics gateway **External/infrastructure:** - `pve-exporter` - Proxmox hypervisor metrics @@ -262,7 +261,7 @@ All scrape targets have these labels: **Standard labels:** - `instance` - Full target address (`.home.2rjus.net:`) - `job` - Job name (e.g., `node-exporter`, `unbound`, `nixos-exporter`) -- `hostname` - Short hostname (e.g., `ns1`, `monitoring01`) - use this for host filtering +- `hostname` - Short hostname (e.g., `ns1`, `monitoring02`) - use this for host filtering **Host metadata labels** (when configured in `homelab.host`): - `role` - Host role (e.g., `dns`, `build-host`, `vault`) @@ 
-275,7 +274,7 @@ Use the `hostname` label for easy host filtering across all jobs: ```promql {hostname="ns1"} # All metrics from ns1 -node_load1{hostname="monitoring01"} # Specific metric by hostname +node_load1{hostname="monitoring02"} # Specific metric by hostname up{hostname="ha1"} # Check if ha1 is up ``` @@ -283,10 +282,10 @@ This is simpler than wildcarding the `instance` label: ```promql # Old way (still works but verbose) -up{instance=~"monitoring01.*"} +up{instance=~"monitoring02.*"} # New way (preferred) -up{hostname="monitoring01"} +up{hostname="monitoring02"} ``` ### Filtering by Role/Tier diff --git a/CLAUDE.md b/CLAUDE.md index 452aea8..e7fc33f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -247,7 +247,7 @@ nix develop -c homelab-deploy -- deploy \ deploy.prod. ``` -Subject format: `deploy..` (e.g., `deploy.prod.monitoring01`, `deploy.test.testvm01`) +Subject format: `deploy..` (e.g., `deploy.prod.monitoring02`, `deploy.test.testvm01`) **Verifying Deployments:** @@ -309,7 +309,7 @@ All hosts automatically get: - OpenBao (Vault) secrets management via AppRole - Internal ACME CA integration (OpenBao PKI at vault.home.2rjus.net) - Daily auto-upgrades with auto-reboot -- Prometheus node-exporter + Promtail (logs to monitoring01) +- Prometheus node-exporter + Promtail (logs to monitoring02) - Monitoring scrape target auto-registration via `homelab.monitoring` options - Custom root CA trust - DNS zone auto-registration via `homelab.dns` options @@ -335,7 +335,7 @@ Use `nix flake show` or `nix develop -c ansible-inventory --graph` to list all h - Infrastructure subnet: `10.69.13.x` - DNS: ns1/ns2 provide authoritative DNS with primary-secondary setup - Internal CA for ACME certificates (no Let's Encrypt) -- Centralized monitoring at monitoring01 +- Centralized monitoring at monitoring02 - Static networking via systemd-networkd ### Secrets Management @@ -480,23 +480,21 @@ See [docs/host-creation.md](docs/host-creation.md) for the complete host creatio ### 
Monitoring Stack -All hosts ship metrics and logs to `monitoring01`: -- **Metrics**: Prometheus scrapes node-exporter from all hosts -- **Logs**: Promtail ships logs to Loki on monitoring01 -- **Access**: Grafana at monitoring01 for visualization -- **Tracing**: Tempo for distributed tracing -- **Profiling**: Pyroscope for continuous profiling +All hosts ship metrics and logs to `monitoring02`: +- **Metrics**: VictoriaMetrics scrapes node-exporter from all hosts +- **Logs**: Promtail ships logs to Loki on monitoring02 +- **Access**: Grafana at monitoring02 for visualization **Scrape Target Auto-Generation:** -Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation: +VictoriaMetrics scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation: - **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets - **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules - **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix` - **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs` -Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The Prometheus config on monitoring01 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options. +Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The VictoriaMetrics config on monitoring02 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options. To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`. 
diff --git a/README.md b/README.md index 939f514..a13df69 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ NixOS Flake-based configuration repository for a homelab infrastructure. All hos | `ca` | Internal Certificate Authority | | `ha1` | Home Assistant + Zigbee2MQTT + Mosquitto | | `http-proxy` | Reverse proxy | -| `monitoring01` | Prometheus, Grafana, Loki, Tempo, Pyroscope | +| `monitoring02` | VictoriaMetrics, Grafana, Loki, Alertmanager | | `jelly01` | Jellyfin media server | | `nix-cache02` | Nix binary cache + NATS-based build service | | `nats1` | NATS messaging | @@ -121,4 +121,4 @@ No manual intervention is required after `tofu apply`. - Infrastructure subnet: `10.69.13.0/24` - DNS: ns1/ns2 authoritative with primary-secondary AXFR - Internal CA for TLS certificates (migrating from step-ca to OpenBao PKI) -- Centralized monitoring at monitoring01 +- Centralized monitoring at monitoring02 diff --git a/flake.nix b/flake.nix index 222b88f..9960431 100644 --- a/flake.nix +++ b/flake.nix @@ -92,15 +92,6 @@ ./hosts/http-proxy ]; }; - monitoring01 = nixpkgs.lib.nixosSystem { - inherit system; - specialArgs = { - inherit inputs self; - }; - modules = commonModules ++ [ - ./hosts/monitoring01 - ]; - }; jelly01 = nixpkgs.lib.nixosSystem { inherit system; specialArgs = { diff --git a/hosts/http-proxy/configuration.nix b/hosts/http-proxy/configuration.nix index 25e080d..3cb802f 100644 --- a/hosts/http-proxy/configuration.nix +++ b/hosts/http-proxy/configuration.nix @@ -19,8 +19,6 @@ "ha" "z2m" "jelly" - "pyroscope" - "pushgw" ]; nixpkgs.config.allowUnfree = true; diff --git a/hosts/monitoring01/configuration.nix b/hosts/monitoring01/configuration.nix deleted file mode 100644 index 7f87ef7..0000000 --- a/hosts/monitoring01/configuration.nix +++ /dev/null @@ -1,114 +0,0 @@ -{ - pkgs, - ... 
-}: - -{ - imports = [ - ./hardware-configuration.nix - - ../../system - ../../common/vm - ]; - - homelab.host.role = "monitoring"; - - nixpkgs.config.allowUnfree = true; - # Use the systemd-boot EFI boot loader. - boot.loader.grub = { - enable = true; - device = "/dev/sda"; - configurationLimit = 3; - }; - - networking.hostName = "monitoring01"; - networking.domain = "home.2rjus.net"; - networking.useNetworkd = true; - networking.useDHCP = false; - services.resolved.enable = true; - networking.nameservers = [ - "10.69.13.5" - "10.69.13.6" - ]; - - systemd.network.enable = true; - systemd.network.networks."ens18" = { - matchConfig.Name = "ens18"; - address = [ - "10.69.13.13/24" - ]; - routes = [ - { Gateway = "10.69.13.1"; } - ]; - linkConfig.RequiredForOnline = "routable"; - }; - time.timeZone = "Europe/Oslo"; - - nix.settings.experimental-features = [ - "nix-command" - "flakes" - ]; - nix.settings.tarball-ttl = 0; - environment.systemPackages = with pkgs; [ - vim - wget - git - sqlite - ]; - - services.qemuGuest.enable = true; - - # Vault secrets management - vault.enable = true; - homelab.deploy.enable = true; - vault.secrets.backup-helper = { - secretPath = "shared/backup/password"; - extractKey = "password"; - outputDir = "/run/secrets/backup_helper_secret"; - services = [ "restic-backups-grafana" "restic-backups-grafana-db" ]; - }; - - services.restic.backups.grafana = { - repository = "rest:http://10.69.12.52:8000/backup-nix"; - passwordFile = "/run/secrets/backup_helper_secret"; - paths = [ "/var/lib/grafana/plugins" ]; - timerConfig = { - OnCalendar = "daily"; - Persistent = true; - RandomizedDelaySec = "2h"; - }; - pruneOpts = [ - "--keep-daily 7" - "--keep-weekly 4" - "--keep-monthly 6" - "--keep-within 1d" - ]; - extraOptions = [ "--retry-lock=5m" ]; - }; - - services.restic.backups.grafana-db = { - repository = "rest:http://10.69.12.52:8000/backup-nix"; - passwordFile = "/run/secrets/backup_helper_secret"; - command = [ "${pkgs.sqlite}/bin/sqlite3" 
"/var/lib/grafana/data/grafana.db" ".dump" ]; - timerConfig = { - OnCalendar = "daily"; - Persistent = true; - RandomizedDelaySec = "2h"; - }; - pruneOpts = [ - "--keep-daily 7" - "--keep-weekly 4" - "--keep-monthly 6" - "--keep-within 1d" - ]; - extraOptions = [ "--retry-lock=5m" ]; - }; - - # Open ports in the firewall. - # networking.firewall.allowedTCPPorts = [ ... ]; - # networking.firewall.allowedUDPPorts = [ ... ]; - # Or disable the firewall altogether. - networking.firewall.enable = false; - - system.stateVersion = "23.11"; # Did you read the comment? -} diff --git a/hosts/monitoring01/default.nix b/hosts/monitoring01/default.nix deleted file mode 100644 index dc5ef1f..0000000 --- a/hosts/monitoring01/default.nix +++ /dev/null @@ -1,7 +0,0 @@ -{ ... }: -{ - imports = [ - ./configuration.nix - ../../services/monitoring - ]; -} diff --git a/hosts/monitoring01/hardware-configuration.nix b/hosts/monitoring01/hardware-configuration.nix deleted file mode 100644 index 48bf109..0000000 --- a/hosts/monitoring01/hardware-configuration.nix +++ /dev/null @@ -1,42 +0,0 @@ -{ - config, - lib, - pkgs, - modulesPath, - ... -}: - -{ - imports = [ - (modulesPath + "/profiles/qemu-guest.nix") - ]; - boot.initrd.availableKernelModules = [ - "ata_piix" - "uhci_hcd" - "virtio_pci" - "virtio_scsi" - "sd_mod" - "sr_mod" - ]; - boot.initrd.kernelModules = [ "dm-snapshot" ]; - boot.kernelModules = [ - "ptp_kvm" - ]; - boot.extraModulePackages = [ ]; - - fileSystems."/" = { - device = "/dev/disk/by-label/root"; - fsType = "xfs"; - }; - - swapDevices = [ { device = "/dev/disk/by-label/swap"; } ]; - - # Enables DHCP on each ethernet and wireless interface. In case of scripted networking - # (the default) this is the recommended approach. When using systemd-networkd it's - # still possible to use this option, but it's recommended to use it in conjunction - # with explicit per-interface declarations with `networking.interfaces..useDHCP`. 
- networking.useDHCP = lib.mkDefault true; - # networking.interfaces.ens18.useDHCP = lib.mkDefault true; - - nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; -} diff --git a/hosts/monitoring02/default.nix b/hosts/monitoring02/default.nix index 252daf0..4b17441 100644 --- a/hosts/monitoring02/default.nix +++ b/hosts/monitoring02/default.nix @@ -5,5 +5,8 @@ ../../services/victoriametrics ../../services/loki ../../services/monitoring/alerttonotify.nix + ../../services/monitoring/blackbox.nix + ../../services/monitoring/exportarr.nix + ../../services/monitoring/pve.nix ]; } \ No newline at end of file diff --git a/scripts/vault-fetch/README.md b/scripts/vault-fetch/README.md index 688f20c..1477080 100644 --- a/scripts/vault-fetch/README.md +++ b/scripts/vault-fetch/README.md @@ -20,10 +20,10 @@ vault-fetch [cache-directory] ```bash # Fetch Grafana admin secrets -vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana /var/lib/vault/cache/grafana +vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana /var/lib/vault/cache/grafana # Use default cache location -vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana +vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana ``` ## How It Works @@ -53,13 +53,13 @@ If Vault is unreachable or authentication fails: This tool is designed to be called from systemd service `ExecStartPre` hooks via the `vault.secrets` NixOS module: ```nix -vault.secrets.grafana-admin = { - secretPath = "hosts/monitoring01/grafana-admin"; +vault.secrets.mqtt-password = { + secretPath = "hosts/ha1/mqtt-password"; }; # Service automatically gets secrets fetched before start -systemd.services.grafana.serviceConfig = { - EnvironmentFile = "/run/secrets/grafana-admin/password"; +systemd.services.mosquitto.serviceConfig = { + EnvironmentFile = "/run/secrets/mqtt-password/password"; }; ``` diff --git a/scripts/vault-fetch/vault-fetch.sh b/scripts/vault-fetch/vault-fetch.sh index 3c2bd33..a500bba 100644 --- 
a/scripts/vault-fetch/vault-fetch.sh +++ b/scripts/vault-fetch/vault-fetch.sh @@ -5,7 +5,7 @@ set -euo pipefail # # Usage: vault-fetch [cache-directory] # -# Example: vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana /var/lib/vault/cache/grafana +# Example: vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana /var/lib/vault/cache/grafana # # This script: # 1. Authenticates to Vault using AppRole credentials from /var/lib/vault/approle/ @@ -17,7 +17,7 @@ set -euo pipefail # Parse arguments if [ $# -lt 2 ]; then echo "Usage: vault-fetch [cache-directory]" >&2 - echo "Example: vault-fetch hosts/monitoring01/grafana /run/secrets/grafana /var/lib/vault/cache/grafana" >&2 + echo "Example: vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana /var/lib/vault/cache/grafana" >&2 exit 1 fi diff --git a/services/grafana/default.nix b/services/grafana/default.nix index 8fb645f..3da828b 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -45,13 +45,7 @@ isDefault = true; uid = "victoriametrics"; } - { - name = "Prometheus (monitoring01)"; - type = "prometheus"; - url = "http://monitoring01.home.2rjus.net:9090"; - uid = "prometheus"; - } - { +{ name = "Loki"; type = "loki"; url = "http://localhost:3100"; diff --git a/services/http-proxy/proxy.nix b/services/http-proxy/proxy.nix index 613a162..151da5a 100644 --- a/services/http-proxy/proxy.nix +++ b/services/http-proxy/proxy.nix @@ -63,22 +63,6 @@ } reverse_proxy http://jelly01.home.2rjus.net:8096 } - pyroscope.home.2rjus.net { - log { - output file /var/log/caddy/pyroscope.log { - mode 644 - } - } - reverse_proxy http://monitoring01.home.2rjus.net:4040 - } - pushgw.home.2rjus.net { - log { - output file /var/log/caddy/pushgw.log { - mode 644 - } - } - reverse_proxy http://monitoring01.home.2rjus.net:9091 - } http://http-proxy.home.2rjus.net/metrics { log { output file /var/log/caddy/caddy-metrics.log { diff --git a/services/monitoring/blackbox.nix 
b/services/monitoring/blackbox.nix index 1e334db..8e7f890 100644 --- a/services/monitoring/blackbox.nix +++ b/services/monitoring/blackbox.nix @@ -1,33 +1,4 @@ { pkgs, ... }: -let - # TLS endpoints to monitor for certificate expiration - # These are all services using ACME certificates from OpenBao PKI - tlsTargets = [ - # Direct ACME certs (security.acme.certs) - "https://vault.home.2rjus.net:8200" - "https://auth.home.2rjus.net" - "https://testvm01.home.2rjus.net" - - # Caddy auto-TLS on http-proxy - "https://nzbget.home.2rjus.net" - "https://radarr.home.2rjus.net" - "https://sonarr.home.2rjus.net" - "https://ha.home.2rjus.net" - "https://z2m.home.2rjus.net" - "https://prometheus.home.2rjus.net" - "https://alertmanager.home.2rjus.net" - "https://grafana.home.2rjus.net" - "https://jelly.home.2rjus.net" - "https://pyroscope.home.2rjus.net" - "https://pushgw.home.2rjus.net" - - # Caddy auto-TLS on nix-cache02 - "https://nix-cache.home.2rjus.net" - - # Caddy auto-TLS on grafana01 - "https://grafana-test.home.2rjus.net" - ]; -in { services.prometheus.exporters.blackbox = { enable = true; @@ -57,36 +28,4 @@ in - 503 ''; }; - - # Add blackbox scrape config to Prometheus - # Alert rules are in rules.yml (certificate_rules group) - services.prometheus.scrapeConfigs = [ - { - job_name = "blackbox_tls"; - metrics_path = "/probe"; - params = { - module = [ "https_cert" ]; - }; - static_configs = [{ - targets = tlsTargets; - }]; - relabel_configs = [ - # Pass the target URL to blackbox as a parameter - { - source_labels = [ "__address__" ]; - target_label = "__param_target"; - } - # Use the target URL as the instance label - { - source_labels = [ "__param_target" ]; - target_label = "instance"; - } - # Point the actual scrape at the local blackbox exporter - { - target_label = "__address__"; - replacement = "127.0.0.1:9115"; - } - ]; - } - ]; } diff --git a/services/monitoring/default.nix b/services/monitoring/default.nix deleted file mode 100644 index 5110ff4..0000000 --- 
a/services/monitoring/default.nix +++ /dev/null @@ -1,14 +0,0 @@ -{ ... }: -{ - imports = [ - ./loki.nix - ./grafana.nix - ./prometheus.nix - ./blackbox.nix - ./exportarr.nix - ./pve.nix - ./alerttonotify.nix - ./pyroscope.nix - ./tempo.nix - ]; -} diff --git a/services/monitoring/exportarr.nix b/services/monitoring/exportarr.nix index b6d1436..970cad1 100644 --- a/services/monitoring/exportarr.nix +++ b/services/monitoring/exportarr.nix @@ -14,14 +14,4 @@ apiKeyFile = config.vault.secrets.sonarr-api-key.outputDir; port = 9709; }; - - # Scrape config - services.prometheus.scrapeConfigs = [ - { - job_name = "sonarr"; - static_configs = [{ - targets = [ "localhost:9709" ]; - }]; - } - ]; } diff --git a/services/monitoring/grafana.nix b/services/monitoring/grafana.nix deleted file mode 100644 index f72f344..0000000 --- a/services/monitoring/grafana.nix +++ /dev/null @@ -1,11 +0,0 @@ -{ pkgs, ... }: -{ - services.grafana = { - enable = true; - settings = { - server = { - http_addr = ""; - }; - }; - }; -} diff --git a/services/monitoring/loki.nix b/services/monitoring/loki.nix deleted file mode 100644 index 87ee06f..0000000 --- a/services/monitoring/loki.nix +++ /dev/null @@ -1,58 +0,0 @@ -{ ... 
}: -{ - services.loki = { - enable = true; - configuration = { - auth_enabled = false; - - server = { - http_listen_port = 3100; - }; - common = { - ring = { - instance_addr = "127.0.0.1"; - kvstore = { - store = "inmemory"; - }; - }; - replication_factor = 1; - path_prefix = "/var/lib/loki"; - }; - schema_config = { - configs = [ - { - from = "2024-01-01"; - store = "tsdb"; - object_store = "filesystem"; - schema = "v13"; - index = { - prefix = "loki_index_"; - period = "24h"; - }; - } - ]; - }; - storage_config = { - filesystem = { - directory = "/var/lib/loki/chunks"; - }; - }; - compactor = { - working_directory = "/var/lib/loki/compactor"; - compaction_interval = "10m"; - retention_enabled = true; - retention_delete_delay = "2h"; - retention_delete_worker_count = 150; - delete_request_store = "filesystem"; - }; - limits_config = { - retention_period = "30d"; - ingestion_rate_mb = 10; - ingestion_burst_size_mb = 20; - max_streams_per_user = 10000; - max_query_series = 500; - max_query_parallelism = 8; - }; - }; - }; -} diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix deleted file mode 100644 index 1d9c196..0000000 --- a/services/monitoring/prometheus.nix +++ /dev/null @@ -1,267 +0,0 @@ -{ self, lib, pkgs, ... }: -let - monLib = import ../../lib/monitoring.nix { inherit lib; }; - externalTargets = import ./external-targets.nix; - - nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets; - autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets; - - # Script to fetch AppRole token for Prometheus to use when scraping OpenBao metrics - fetchOpenbaoToken = pkgs.writeShellApplication { - name = "fetch-openbao-token"; - runtimeInputs = [ pkgs.curl pkgs.jq ]; - text = '' - VAULT_ADDR="https://vault01.home.2rjus.net:8200" - APPROLE_DIR="/var/lib/vault/approle" - OUTPUT_FILE="/run/secrets/prometheus/openbao-token" - - # Read AppRole credentials - if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! 
-f "$APPROLE_DIR/secret-id" ]; then - echo "AppRole credentials not found at $APPROLE_DIR" >&2 - exit 1 - fi - - ROLE_ID=$(cat "$APPROLE_DIR/role-id") - SECRET_ID=$(cat "$APPROLE_DIR/secret-id") - - # Authenticate to Vault - AUTH_RESPONSE=$(curl -sf -k -X POST \ - -d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \ - "$VAULT_ADDR/v1/auth/approle/login") - - # Extract token - VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token') - if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then - echo "Failed to extract Vault token from response" >&2 - exit 1 - fi - - # Write token to file - mkdir -p "$(dirname "$OUTPUT_FILE")" - echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE" - chown prometheus:prometheus "$OUTPUT_FILE" - chmod 0400 "$OUTPUT_FILE" - - echo "Successfully fetched OpenBao token" - ''; - }; -in -{ - # Systemd service to fetch AppRole token for Prometheus OpenBao scraping - # The token is used to authenticate when scraping /v1/sys/metrics - systemd.services.prometheus-openbao-token = { - description = "Fetch OpenBao token for Prometheus metrics scraping"; - after = [ "network-online.target" ]; - wants = [ "network-online.target" ]; - before = [ "prometheus.service" ]; - requiredBy = [ "prometheus.service" ]; - - serviceConfig = { - Type = "oneshot"; - ExecStart = lib.getExe fetchOpenbaoToken; - }; - }; - - # Timer to periodically refresh the token (AppRole tokens have 1-hour TTL) - systemd.timers.prometheus-openbao-token = { - description = "Refresh OpenBao token for Prometheus"; - wantedBy = [ "timers.target" ]; - timerConfig = { - OnBootSec = "5min"; - OnUnitActiveSec = "30min"; - RandomizedDelaySec = "5min"; - }; - }; - - # Fetch apiary bearer token from Vault - vault.secrets.prometheus-apiary-token = { - secretPath = "hosts/monitoring01/apiary-token"; - extractKey = "password"; - owner = "prometheus"; - group = "prometheus"; - services = [ "prometheus" ]; - }; - - services.prometheus = { - enable = true; - # syntax-only check because we 
use external credential files (e.g., openbao-token) - checkConfig = "syntax-only"; - alertmanager = { - enable = true; - configuration = { - global = { - }; - route = { - receiver = "webhook_natstonotify"; - group_wait = "30s"; - group_interval = "5m"; - repeat_interval = "1h"; - group_by = [ "alertname" ]; - }; - receivers = [ - { - name = "webhook_natstonotify"; - webhook_configs = [ - { - url = "http://localhost:5001/alert"; - } - ]; - } - ]; - }; - }; - alertmanagers = [ - { - static_configs = [ - { - targets = [ "localhost:9093" ]; - } - ]; - } - ]; - - retentionTime = "30d"; - globalConfig = { - scrape_interval = "15s"; - }; - rules = [ - (builtins.readFile ./rules.yml) - ]; - - scrapeConfigs = [ - # Auto-generated node-exporter targets from flake hosts + external - # Each static_config entry may have labels from homelab.host metadata - { - job_name = "node-exporter"; - static_configs = nodeExporterTargets; - } - # Systemd exporter on all hosts (same targets, different port) - # Preserves the same label grouping as node-exporter - { - job_name = "systemd-exporter"; - static_configs = map - (cfg: cfg // { - targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets; - }) - nodeExporterTargets; - } - # Local monitoring services (not auto-generated) - { - job_name = "prometheus"; - static_configs = [ - { - targets = [ "localhost:9090" ]; - } - ]; - } - { - job_name = "loki"; - static_configs = [ - { - targets = [ "localhost:3100" ]; - } - ]; - } - { - job_name = "grafana"; - static_configs = [ - { - targets = [ "localhost:3000" ]; - } - ]; - } - { - job_name = "alertmanager"; - static_configs = [ - { - targets = [ "localhost:9093" ]; - } - ]; - } - { - job_name = "pushgateway"; - honor_labels = true; - static_configs = [ - { - targets = [ "localhost:9091" ]; - } - ]; - } - # Caddy metrics from nix-cache02 (serves nix-cache.home.2rjus.net) - { - job_name = "nix-cache_caddy"; - scheme = "https"; - static_configs = [ - { - targets = [ 
"nix-cache.home.2rjus.net" ]; - } - ]; - } - # pve-exporter with complex relabel config - { - job_name = "pve-exporter"; - static_configs = [ - { - targets = [ "10.69.12.75" ]; - } - ]; - metrics_path = "/pve"; - params = { - module = [ "default" ]; - cluster = [ "1" ]; - node = [ "1" ]; - }; - relabel_configs = [ - { - source_labels = [ "__address__" ]; - target_label = "__param_target"; - } - { - source_labels = [ "__param_target" ]; - target_label = "instance"; - } - { - target_label = "__address__"; - replacement = "127.0.0.1:9221"; - } - ]; - } - # OpenBao metrics with bearer token auth - { - job_name = "openbao"; - scheme = "https"; - metrics_path = "/v1/sys/metrics"; - params = { - format = [ "prometheus" ]; - }; - static_configs = [{ - targets = [ "vault01.home.2rjus.net:8200" ]; - }]; - authorization = { - type = "Bearer"; - credentials_file = "/run/secrets/prometheus/openbao-token"; - }; - } - # Apiary external service - { - job_name = "apiary"; - scheme = "https"; - scrape_interval = "60s"; - static_configs = [{ - targets = [ "apiary.t-juice.club" ]; - }]; - authorization = { - type = "Bearer"; - credentials_file = "/run/secrets/prometheus-apiary-token"; - }; - } - ] ++ autoScrapeConfigs; - - pushgateway = { - enable = true; - web = { - external-url = "https://pushgw.home.2rjus.net"; - }; - }; - }; -} diff --git a/services/monitoring/pve.nix b/services/monitoring/pve.nix index 45f92ef..b35e91f 100644 --- a/services/monitoring/pve.nix +++ b/services/monitoring/pve.nix @@ -1,7 +1,7 @@ { config, ... }: { vault.secrets.pve-exporter = { - secretPath = "hosts/monitoring01/pve-exporter"; + secretPath = "hosts/monitoring02/pve-exporter"; extractKey = "config"; outputDir = "/run/secrets/pve_exporter"; mode = "0444"; diff --git a/services/monitoring/pyroscope.nix b/services/monitoring/pyroscope.nix deleted file mode 100644 index 03274ef..0000000 --- a/services/monitoring/pyroscope.nix +++ /dev/null @@ -1,8 +0,0 @@ -{ ... 
}: -{ - virtualisation.oci-containers.containers.pyroscope = { - pull = "missing"; - image = "grafana/pyroscope:latest"; - ports = [ "4040:4040" ]; - }; -} diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index a5f8abf..14b9baf 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -259,32 +259,32 @@ groups: description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}." - name: monitoring_rules rules: - - alert: prometheus_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0 + - alert: victoriametrics_not_running + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0 for: 5m labels: severity: critical annotations: - summary: "Prometheus service not running on {{ $labels.instance }}" - description: "Prometheus service not running on {{ $labels.instance }}" + summary: "VictoriaMetrics service not running on {{ $labels.instance }}" + description: "VictoriaMetrics service not running on {{ $labels.instance }}" + - alert: vmalert_not_running + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "vmalert service not running on {{ $labels.instance }}" + description: "vmalert service not running on {{ $labels.instance }}" - alert: alertmanager_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Alertmanager service not running on {{ $labels.instance }}" description: "Alertmanager service not running on {{ $labels.instance }}" 
- - alert: pushgateway_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Pushgateway service not running on {{ $labels.instance }}" - description: "Pushgateway service not running on {{ $labels.instance }}" - alert: loki_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0 + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0 for: 5m labels: severity: critical @@ -292,29 +292,13 @@ groups: summary: "Loki service not running on {{ $labels.instance }}" description: "Loki service not running on {{ $labels.instance }}" - alert: grafana_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 for: 5m labels: severity: warning annotations: summary: "Grafana service not running on {{ $labels.instance }}" description: "Grafana service not running on {{ $labels.instance }}" - - alert: tempo_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Tempo service not running on {{ $labels.instance }}" - description: "Tempo service not running on {{ $labels.instance }}" - - alert: pyroscope_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Pyroscope service not running on {{ $labels.instance }}" - description: "Pyroscope service not running on {{ $labels.instance }}" - name: proxmox_rules 
rules: - alert: pve_node_down diff --git a/services/monitoring/tempo.nix b/services/monitoring/tempo.nix deleted file mode 100644 index 8fda7f8..0000000 --- a/services/monitoring/tempo.nix +++ /dev/null @@ -1,37 +0,0 @@ -{ ... }: -{ - services.tempo = { - enable = true; - settings = { - server = { - http_listen_port = 3200; - grpc_listen_port = 3201; - }; - distributor = { - receivers = { - otlp = { - protocols = { - http = { - endpoint = ":4318"; - cors = { - allowed_origins = [ "*.home.2rjus.net" ]; - }; - }; - }; - }; - }; - }; - storage = { - trace = { - backend = "local"; - local = { - path = "/var/lib/tempo"; - }; - wal = { - path = "/var/lib/tempo/wal"; - }; - }; - }; - }; - }; -} diff --git a/services/victoriametrics/default.nix b/services/victoriametrics/default.nix index 2c2af1b..6275edc 100644 --- a/services/victoriametrics/default.nix +++ b/services/victoriametrics/default.nix @@ -6,6 +6,24 @@ let nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets; autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets; + # TLS endpoints to monitor for certificate expiration via blackbox exporter + tlsTargets = [ + "https://vault.home.2rjus.net:8200" + "https://auth.home.2rjus.net" + "https://testvm01.home.2rjus.net" + "https://nzbget.home.2rjus.net" + "https://radarr.home.2rjus.net" + "https://sonarr.home.2rjus.net" + "https://ha.home.2rjus.net" + "https://z2m.home.2rjus.net" + "https://metrics.home.2rjus.net" + "https://alertmanager.home.2rjus.net" + "https://grafana.home.2rjus.net" + "https://jelly.home.2rjus.net" + "https://nix-cache.home.2rjus.net" + "https://grafana-test.home.2rjus.net" + ]; + # Script to fetch AppRole token for VictoriaMetrics to use when scraping OpenBao metrics fetchOpenbaoToken = pkgs.writeShellApplication { name = "fetch-openbao-token-vm"; @@ -107,6 +125,60 @@ let credentials_file = "/run/secrets/victoriametrics-apiary-token"; }; } + # Blackbox TLS certificate monitoring + { + job_name =
"blackbox_tls"; + metrics_path = "/probe"; + params = { + module = [ "https_cert" ]; + }; + static_configs = [{ targets = tlsTargets; }]; + relabel_configs = [ + { + source_labels = [ "__address__" ]; + target_label = "__param_target"; + } + { + source_labels = [ "__param_target" ]; + target_label = "instance"; + } + { + target_label = "__address__"; + replacement = "127.0.0.1:9115"; + } + ]; + } + # Sonarr exporter + { + job_name = "sonarr"; + static_configs = [{ targets = [ "localhost:9709" ]; }]; + } + # Proxmox VE exporter: metrics are served on /pve and the PVE node is passed + # as the target parameter; the scrape itself goes to the local exporter + { + job_name = "pve"; + metrics_path = "/pve"; + params = { + module = [ "default" ]; + cluster = [ "1" ]; + node = [ "1" ]; + }; + static_configs = [{ targets = [ "10.69.12.75" ]; }]; + relabel_configs = [ + { + source_labels = [ "__address__" ]; + target_label = "__param_target"; + } + { + source_labels = [ "__param_target" ]; + target_label = "instance"; + } + { + target_label = "__address__"; + replacement = "127.0.0.1:9221"; + } + ]; + } ] ++ autoScrapeConfigs; in { @@ -152,7 +224,7 @@ in # Fetch apiary bearer token from Vault vault.secrets.victoriametrics-apiary-token = { - secretPath = "hosts/monitoring01/apiary-token"; + secretPath = "hosts/monitoring02/apiary-token"; extractKey = "password"; owner = "victoriametrics"; group = "victoriametrics"; diff --git a/system/vault-secrets.nix b/system/vault-secrets.nix index 60466dd..bc5608c 100644 --- a/system/vault-secrets.nix +++ b/system/vault-secrets.nix @@ -57,7 +57,7 @@ let type = types.str; description = '' Path to the secret in Vault (without /v1/secret/data/ prefix).
- Example: "hosts/monitoring01/grafana-admin" + Example: "hosts/ha1/mqtt-password" ''; }; @@ -152,13 +152,11 @@ in ''; example = literalExpression '' { - grafana-admin = { - secretPath = "hosts/monitoring01/grafana-admin"; - owner = "grafana"; - group = "grafana"; - restartTrigger = true; - restartInterval = "daily"; - services = [ "grafana" ]; + mqtt-password = { + secretPath = "hosts/ha1/mqtt-password"; + owner = "mosquitto"; + group = "mosquitto"; + services = [ "mosquitto" ]; }; } ''; diff --git a/terraform/vault/approle.tf b/terraform/vault/approle.tf index 1e5956a..6a12464 100644 --- a/terraform/vault/approle.tf +++ b/terraform/vault/approle.tf @@ -40,23 +40,13 @@ EOT # Define host access policies locals { host_policies = { - # Example: monitoring01 host - # "monitoring01" = { - # paths = [ - # "secret/data/hosts/monitoring01/*", - # "secret/data/services/prometheus/*", - # "secret/data/services/grafana/*", - # "secret/data/shared/smtp/*" - # ] - # extra_policies = ["some-other-policy"] # Optional: additional policies - # } - - # Example: ha1 host + # Example: # "ha1" = { # paths = [ # "secret/data/hosts/ha1/*", # "secret/data/shared/mqtt/*" # ] + # extra_policies = ["some-other-policy"] # Optional: additional policies # } "ha1" = { @@ -66,16 +56,6 @@ locals { ] } - "monitoring01" = { - paths = [ - "secret/data/hosts/monitoring01/*", - "secret/data/shared/backup/*", - "secret/data/shared/nats/*", - "secret/data/services/exportarr/*", - ] - extra_policies = ["prometheus-metrics"] - } - # Wave 1: hosts with no service secrets (only need vault.enable for future use) "nats1" = { paths = [ diff --git a/terraform/vault/hosts-generated.tf b/terraform/vault/hosts-generated.tf index 5257919..5fa92c2 100644 --- a/terraform/vault/hosts-generated.tf +++ b/terraform/vault/hosts-generated.tf @@ -47,8 +47,8 @@ locals { "monitoring02" = { paths = [ "secret/data/hosts/monitoring02/*", - "secret/data/hosts/monitoring01/apiary-token", "secret/data/services/grafana/*", + 
"secret/data/services/exportarr/*", "secret/data/shared/nats/nkey", ] extra_policies = ["prometheus-metrics"] diff --git a/terraform/vault/secrets.tf b/terraform/vault/secrets.tf index a84baff..83416a6 100644 --- a/terraform/vault/secrets.tf +++ b/terraform/vault/secrets.tf @@ -10,10 +10,6 @@ resource "vault_mount" "kv" { locals { secrets = { # Example host-specific secrets - # "hosts/monitoring01/grafana-admin" = { - # auto_generate = true - # password_length = 32 - # } # "hosts/ha1/mqtt-password" = { # auto_generate = true # password_length = 24 @@ -35,11 +31,6 @@ locals { # } # } - "hosts/monitoring01/grafana-admin" = { - auto_generate = true - password_length = 32 - } - "hosts/ha1/mqtt-password" = { auto_generate = true password_length = 24 @@ -57,8 +48,8 @@ locals { data = { nkey = var.nats_nkey } } - # PVE exporter config for monitoring01 - "hosts/monitoring01/pve-exporter" = { + # PVE exporter config for monitoring02 + "hosts/monitoring02/pve-exporter" = { auto_generate = false data = { config = var.pve_exporter_config } } @@ -149,7 +140,7 @@ locals { } # Bearer token for scraping apiary metrics - "hosts/monitoring01/apiary-token" = { + "hosts/monitoring02/apiary-token" = { auto_generate = true password_length = 64 }