diff --git a/.claude/agents/investigate-alarm.md b/.claude/agents/investigate-alarm.md index 11462ea..ebdf233 100644 --- a/.claude/agents/investigate-alarm.md +++ b/.claude/agents/investigate-alarm.md @@ -130,7 +130,7 @@ get_commit_info() # Get full details of a specific change ``` **Example workflow for a service-related alert:** -1. Query `nixos_flake_info{hostname="monitoring01"}` → `current_rev: 8959829` +1. Query `nixos_flake_info{hostname="monitoring02"}` → `current_rev: 8959829` 2. `resolve_ref("master")` → `4633421` 3. `is_ancestor("8959829", "4633421")` → Yes, host is behind 4. `commits_between("8959829", "4633421")` → 7 commits missing diff --git a/.claude/skills/observability/SKILL.md b/.claude/skills/observability/SKILL.md index f89ea93..3b3886f 100644 --- a/.claude/skills/observability/SKILL.md +++ b/.claude/skills/observability/SKILL.md @@ -30,7 +30,7 @@ Use the `lab-monitoring` MCP server tools: ### Label Reference Available labels for log queries: -- `hostname` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) - matches the Prometheus `hostname` label +- `hostname` - Hostname (e.g., `ns1`, `monitoring02`, `ha1`) - matches the Prometheus `hostname` label - `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`) - `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs) - `filename` - For `varlog` job, the log file path @@ -54,7 +54,7 @@ Journal logs are JSON-formatted. Key fields: **All logs from a host:** ```logql -{hostname="monitoring01"} +{hostname="monitoring02"} ``` **Logs from a service across all hosts:** @@ -74,7 +74,7 @@ Journal logs are JSON-formatted. Key fields: **Regex matching:** ```logql -{systemd_unit="prometheus.service"} |~ "scrape.*failed" +{systemd_unit="victoriametrics.service"} |~ "scrape.*failed" ``` **Filter by level (journal scrape only):** @@ -109,7 +109,7 @@ Default lookback is 1 hour. 
Use `start` parameter for older logs: Useful systemd units for troubleshooting: - `nixos-upgrade.service` - Daily auto-upgrade logs - `nsd.service` - DNS server (ns1/ns2) -- `prometheus.service` - Metrics collection +- `victoriametrics.service` - Metrics collection - `loki.service` - Log aggregation - `caddy.service` - Reverse proxy - `home-assistant.service` - Home automation @@ -152,7 +152,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl Parse JSON and filter on fields: ```logql -{systemd_unit="prometheus.service"} | json | PRIORITY="3" +{systemd_unit="victoriametrics.service"} | json | PRIORITY="3" ``` --- @@ -242,12 +242,11 @@ All available Prometheus job names: - `unbound` - DNS resolver metrics (ns1, ns2) - `wireguard` - VPN tunnel metrics (http-proxy) -**Monitoring stack (localhost on monitoring01):** -- `prometheus` - Prometheus self-metrics +**Monitoring stack (localhost on monitoring02):** +- `victoriametrics` - VictoriaMetrics self-metrics - `loki` - Loki self-metrics - `grafana` - Grafana self-metrics - `alertmanager` - Alertmanager metrics -- `pushgateway` - Push-based metrics gateway **External/infrastructure:** - `pve-exporter` - Proxmox hypervisor metrics @@ -262,7 +261,7 @@ All scrape targets have these labels: **Standard labels:** - `instance` - Full target address (`.home.2rjus.net:`) - `job` - Job name (e.g., `node-exporter`, `unbound`, `nixos-exporter`) -- `hostname` - Short hostname (e.g., `ns1`, `monitoring01`) - use this for host filtering +- `hostname` - Short hostname (e.g., `ns1`, `monitoring02`) - use this for host filtering **Host metadata labels** (when configured in `homelab.host`): - `role` - Host role (e.g., `dns`, `build-host`, `vault`) @@ -275,7 +274,7 @@ Use the `hostname` label for easy host filtering across all jobs: ```promql {hostname="ns1"} # All metrics from ns1 -node_load1{hostname="monitoring01"} # Specific metric by hostname +node_load1{hostname="monitoring02"} # Specific metric by 
hostname up{hostname="ha1"} # Check if ha1 is up ``` @@ -283,10 +282,10 @@ This is simpler than wildcarding the `instance` label: ```promql # Old way (still works but verbose) -up{instance=~"monitoring01.*"} +up{instance=~"monitoring02.*"} # New way (preferred) -up{hostname="monitoring01"} +up{hostname="monitoring02"} ``` ### Filtering by Role/Tier diff --git a/CLAUDE.md b/CLAUDE.md index 452aea8..e7fc33f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -247,7 +247,7 @@ nix develop -c homelab-deploy -- deploy \ deploy.prod. ``` -Subject format: `deploy..` (e.g., `deploy.prod.monitoring01`, `deploy.test.testvm01`) +Subject format: `deploy..` (e.g., `deploy.prod.monitoring02`, `deploy.test.testvm01`) **Verifying Deployments:** @@ -309,7 +309,7 @@ All hosts automatically get: - OpenBao (Vault) secrets management via AppRole - Internal ACME CA integration (OpenBao PKI at vault.home.2rjus.net) - Daily auto-upgrades with auto-reboot -- Prometheus node-exporter + Promtail (logs to monitoring01) +- Prometheus node-exporter + Promtail (logs to monitoring02) - Monitoring scrape target auto-registration via `homelab.monitoring` options - Custom root CA trust - DNS zone auto-registration via `homelab.dns` options @@ -335,7 +335,7 @@ Use `nix flake show` or `nix develop -c ansible-inventory --graph` to list all h - Infrastructure subnet: `10.69.13.x` - DNS: ns1/ns2 provide authoritative DNS with primary-secondary setup - Internal CA for ACME certificates (no Let's Encrypt) -- Centralized monitoring at monitoring01 +- Centralized monitoring at monitoring02 - Static networking via systemd-networkd ### Secrets Management @@ -480,23 +480,21 @@ See [docs/host-creation.md](docs/host-creation.md) for the complete host creatio ### Monitoring Stack -All hosts ship metrics and logs to `monitoring01`: -- **Metrics**: Prometheus scrapes node-exporter from all hosts -- **Logs**: Promtail ships logs to Loki on monitoring01 -- **Access**: Grafana at monitoring01 for visualization -- **Tracing**: 
Tempo for distributed tracing -- **Profiling**: Pyroscope for continuous profiling +All hosts ship metrics and logs to `monitoring02`: +- **Metrics**: VictoriaMetrics scrapes node-exporter from all hosts +- **Logs**: Promtail ships logs to Loki on monitoring02 +- **Access**: Grafana at monitoring02 for visualization **Scrape Target Auto-Generation:** -Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation: +VictoriaMetrics scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation: - **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets - **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules - **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix` - **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs` -Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The Prometheus config on monitoring01 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options. +Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The VictoriaMetrics config on monitoring02 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options. To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`. diff --git a/README.md b/README.md index 939f514..a13df69 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ NixOS Flake-based configuration repository for a homelab infrastructure. 
All hos | `ca` | Internal Certificate Authority | | `ha1` | Home Assistant + Zigbee2MQTT + Mosquitto | | `http-proxy` | Reverse proxy | -| `monitoring01` | Prometheus, Grafana, Loki, Tempo, Pyroscope | +| `monitoring02` | VictoriaMetrics, Grafana, Loki, Alertmanager | | `jelly01` | Jellyfin media server | | `nix-cache02` | Nix binary cache + NATS-based build service | | `nats1` | NATS messaging | @@ -121,4 +121,4 @@ No manual intervention is required after `tofu apply`. - Infrastructure subnet: `10.69.13.0/24` - DNS: ns1/ns2 authoritative with primary-secondary AXFR - Internal CA for TLS certificates (migrating from step-ca to OpenBao PKI) -- Centralized monitoring at monitoring01 +- Centralized monitoring at monitoring02 diff --git a/flake.nix b/flake.nix index 222b88f..9960431 100644 --- a/flake.nix +++ b/flake.nix @@ -92,15 +92,6 @@ ./hosts/http-proxy ]; }; - monitoring01 = nixpkgs.lib.nixosSystem { - inherit system; - specialArgs = { - inherit inputs self; - }; - modules = commonModules ++ [ - ./hosts/monitoring01 - ]; - }; jelly01 = nixpkgs.lib.nixosSystem { inherit system; specialArgs = { diff --git a/hosts/http-proxy/configuration.nix b/hosts/http-proxy/configuration.nix index 25e080d..3cb802f 100644 --- a/hosts/http-proxy/configuration.nix +++ b/hosts/http-proxy/configuration.nix @@ -19,8 +19,6 @@ "ha" "z2m" "jelly" - "pyroscope" - "pushgw" ]; nixpkgs.config.allowUnfree = true; diff --git a/hosts/monitoring01/configuration.nix b/hosts/monitoring01/configuration.nix deleted file mode 100644 index 7f87ef7..0000000 --- a/hosts/monitoring01/configuration.nix +++ /dev/null @@ -1,114 +0,0 @@ -{ - pkgs, - ... -}: - -{ - imports = [ - ./hardware-configuration.nix - - ../../system - ../../common/vm - ]; - - homelab.host.role = "monitoring"; - - nixpkgs.config.allowUnfree = true; - # Use the systemd-boot EFI boot loader. 
- boot.loader.grub = { - enable = true; - device = "/dev/sda"; - configurationLimit = 3; - }; - - networking.hostName = "monitoring01"; - networking.domain = "home.2rjus.net"; - networking.useNetworkd = true; - networking.useDHCP = false; - services.resolved.enable = true; - networking.nameservers = [ - "10.69.13.5" - "10.69.13.6" - ]; - - systemd.network.enable = true; - systemd.network.networks."ens18" = { - matchConfig.Name = "ens18"; - address = [ - "10.69.13.13/24" - ]; - routes = [ - { Gateway = "10.69.13.1"; } - ]; - linkConfig.RequiredForOnline = "routable"; - }; - time.timeZone = "Europe/Oslo"; - - nix.settings.experimental-features = [ - "nix-command" - "flakes" - ]; - nix.settings.tarball-ttl = 0; - environment.systemPackages = with pkgs; [ - vim - wget - git - sqlite - ]; - - services.qemuGuest.enable = true; - - # Vault secrets management - vault.enable = true; - homelab.deploy.enable = true; - vault.secrets.backup-helper = { - secretPath = "shared/backup/password"; - extractKey = "password"; - outputDir = "/run/secrets/backup_helper_secret"; - services = [ "restic-backups-grafana" "restic-backups-grafana-db" ]; - }; - - services.restic.backups.grafana = { - repository = "rest:http://10.69.12.52:8000/backup-nix"; - passwordFile = "/run/secrets/backup_helper_secret"; - paths = [ "/var/lib/grafana/plugins" ]; - timerConfig = { - OnCalendar = "daily"; - Persistent = true; - RandomizedDelaySec = "2h"; - }; - pruneOpts = [ - "--keep-daily 7" - "--keep-weekly 4" - "--keep-monthly 6" - "--keep-within 1d" - ]; - extraOptions = [ "--retry-lock=5m" ]; - }; - - services.restic.backups.grafana-db = { - repository = "rest:http://10.69.12.52:8000/backup-nix"; - passwordFile = "/run/secrets/backup_helper_secret"; - command = [ "${pkgs.sqlite}/bin/sqlite3" "/var/lib/grafana/data/grafana.db" ".dump" ]; - timerConfig = { - OnCalendar = "daily"; - Persistent = true; - RandomizedDelaySec = "2h"; - }; - pruneOpts = [ - "--keep-daily 7" - "--keep-weekly 4" - "--keep-monthly 
6" - "--keep-within 1d" - ]; - extraOptions = [ "--retry-lock=5m" ]; - }; - - # Open ports in the firewall. - # networking.firewall.allowedTCPPorts = [ ... ]; - # networking.firewall.allowedUDPPorts = [ ... ]; - # Or disable the firewall altogether. - networking.firewall.enable = false; - - system.stateVersion = "23.11"; # Did you read the comment? -} diff --git a/hosts/monitoring01/default.nix b/hosts/monitoring01/default.nix deleted file mode 100644 index dc5ef1f..0000000 --- a/hosts/monitoring01/default.nix +++ /dev/null @@ -1,7 +0,0 @@ -{ ... }: -{ - imports = [ - ./configuration.nix - ../../services/monitoring - ]; -} diff --git a/hosts/monitoring01/hardware-configuration.nix b/hosts/monitoring01/hardware-configuration.nix deleted file mode 100644 index 48bf109..0000000 --- a/hosts/monitoring01/hardware-configuration.nix +++ /dev/null @@ -1,42 +0,0 @@ -{ - config, - lib, - pkgs, - modulesPath, - ... -}: - -{ - imports = [ - (modulesPath + "/profiles/qemu-guest.nix") - ]; - boot.initrd.availableKernelModules = [ - "ata_piix" - "uhci_hcd" - "virtio_pci" - "virtio_scsi" - "sd_mod" - "sr_mod" - ]; - boot.initrd.kernelModules = [ "dm-snapshot" ]; - boot.kernelModules = [ - "ptp_kvm" - ]; - boot.extraModulePackages = [ ]; - - fileSystems."/" = { - device = "/dev/disk/by-label/root"; - fsType = "xfs"; - }; - - swapDevices = [ { device = "/dev/disk/by-label/swap"; } ]; - - # Enables DHCP on each ethernet and wireless interface. In case of scripted networking - # (the default) this is the recommended approach. When using systemd-networkd it's - # still possible to use this option, but it's recommended to use it in conjunction - # with explicit per-interface declarations with `networking.interfaces..useDHCP`. 
- networking.useDHCP = lib.mkDefault true; - # networking.interfaces.ens18.useDHCP = lib.mkDefault true; - - nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; -} diff --git a/hosts/monitoring02/default.nix b/hosts/monitoring02/default.nix index 252daf0..4b17441 100644 --- a/hosts/monitoring02/default.nix +++ b/hosts/monitoring02/default.nix @@ -5,5 +5,8 @@ ../../services/victoriametrics ../../services/loki ../../services/monitoring/alerttonotify.nix + ../../services/monitoring/blackbox.nix + ../../services/monitoring/exportarr.nix + ../../services/monitoring/pve.nix ]; } \ No newline at end of file diff --git a/scripts/vault-fetch/README.md b/scripts/vault-fetch/README.md index 688f20c..1477080 100644 --- a/scripts/vault-fetch/README.md +++ b/scripts/vault-fetch/README.md @@ -20,10 +20,10 @@ vault-fetch [cache-directory] ```bash -# Fetch Grafana admin secrets +# Fetch the Home Assistant MQTT password -vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana /var/lib/vault/cache/grafana +vault-fetch hosts/ha1/mqtt-password /run/secrets/mqtt /var/lib/vault/cache/mqtt # Use default cache location -vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana +vault-fetch hosts/ha1/mqtt-password /run/secrets/mqtt ``` ## How It Works @@ -53,13 +53,13 @@ If Vault is unreachable or authentication fails: This tool is designed to be called from systemd service `ExecStartPre` hooks via the `vault.secrets` NixOS module: ```nix -vault.secrets.grafana-admin = { - secretPath = "hosts/monitoring01/grafana-admin"; +vault.secrets.mqtt-password = { + secretPath = "hosts/ha1/mqtt-password"; }; # Service automatically gets secrets fetched before start -systemd.services.grafana.serviceConfig = { - EnvironmentFile = "/run/secrets/grafana-admin/password"; +systemd.services.mosquitto.serviceConfig = { + EnvironmentFile = "/run/secrets/mqtt-password/password"; }; ``` diff --git a/scripts/vault-fetch/vault-fetch.sh b/scripts/vault-fetch/vault-fetch.sh index 3c2bd33..a500bba 100644 ---
a/scripts/vault-fetch/vault-fetch.sh +++ b/scripts/vault-fetch/vault-fetch.sh @@ -5,7 +5,7 @@ set -euo pipefail # # Usage: vault-fetch [cache-directory] # -# Example: vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana /var/lib/vault/cache/grafana +# Example: vault-fetch hosts/ha1/mqtt-password /run/secrets/mqtt /var/lib/vault/cache/mqtt # # This script: # 1. Authenticates to Vault using AppRole credentials from /var/lib/vault/approle/ @@ -17,7 +17,7 @@ set -euo pipefail # Parse arguments if [ $# -lt 2 ]; then echo "Usage: vault-fetch [cache-directory]" >&2 - echo "Example: vault-fetch hosts/monitoring01/grafana /run/secrets/grafana /var/lib/vault/cache/grafana" >&2 + echo "Example: vault-fetch hosts/ha1/mqtt-password /run/secrets/mqtt /var/lib/vault/cache/mqtt" >&2 exit 1 fi diff --git a/services/grafana/default.nix b/services/grafana/default.nix index 8fb645f..3da828b 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -45,13 +45,7 @@ isDefault = true; uid = "victoriametrics"; } - { - name = "Prometheus (monitoring01)"; - type = "prometheus"; - url = "http://monitoring01.home.2rjus.net:9090"; - uid = "prometheus"; - } - { + { name = "Loki"; type = "loki"; url = "http://localhost:3100"; diff --git a/services/http-proxy/proxy.nix b/services/http-proxy/proxy.nix index 613a162..151da5a 100644 --- a/services/http-proxy/proxy.nix +++ b/services/http-proxy/proxy.nix @@ -63,22 +63,6 @@ } reverse_proxy http://jelly01.home.2rjus.net:8096 } - pyroscope.home.2rjus.net { - log { - output file /var/log/caddy/pyroscope.log { - mode 644 - } - } - reverse_proxy http://monitoring01.home.2rjus.net:4040 - } - pushgw.home.2rjus.net { - log { - output file /var/log/caddy/pushgw.log { - mode 644 - } - } - reverse_proxy http://monitoring01.home.2rjus.net:9091 - } http://http-proxy.home.2rjus.net/metrics { log { output file /var/log/caddy/caddy-metrics.log { diff --git a/services/monitoring/blackbox.nix
b/services/monitoring/blackbox.nix index 1e334db..8e7f890 100644 --- a/services/monitoring/blackbox.nix +++ b/services/monitoring/blackbox.nix @@ -1,33 +1,4 @@ { pkgs, ... }: -let - # TLS endpoints to monitor for certificate expiration - # These are all services using ACME certificates from OpenBao PKI - tlsTargets = [ - # Direct ACME certs (security.acme.certs) - "https://vault.home.2rjus.net:8200" - "https://auth.home.2rjus.net" - "https://testvm01.home.2rjus.net" - - # Caddy auto-TLS on http-proxy - "https://nzbget.home.2rjus.net" - "https://radarr.home.2rjus.net" - "https://sonarr.home.2rjus.net" - "https://ha.home.2rjus.net" - "https://z2m.home.2rjus.net" - "https://prometheus.home.2rjus.net" - "https://alertmanager.home.2rjus.net" - "https://grafana.home.2rjus.net" - "https://jelly.home.2rjus.net" - "https://pyroscope.home.2rjus.net" - "https://pushgw.home.2rjus.net" - - # Caddy auto-TLS on nix-cache02 - "https://nix-cache.home.2rjus.net" - - # Caddy auto-TLS on grafana01 - "https://grafana-test.home.2rjus.net" - ]; -in { services.prometheus.exporters.blackbox = { enable = true; @@ -57,36 +28,4 @@ in - 503 ''; }; - - # Add blackbox scrape config to Prometheus - # Alert rules are in rules.yml (certificate_rules group) - services.prometheus.scrapeConfigs = [ - { - job_name = "blackbox_tls"; - metrics_path = "/probe"; - params = { - module = [ "https_cert" ]; - }; - static_configs = [{ - targets = tlsTargets; - }]; - relabel_configs = [ - # Pass the target URL to blackbox as a parameter - { - source_labels = [ "__address__" ]; - target_label = "__param_target"; - } - # Use the target URL as the instance label - { - source_labels = [ "__param_target" ]; - target_label = "instance"; - } - # Point the actual scrape at the local blackbox exporter - { - target_label = "__address__"; - replacement = "127.0.0.1:9115"; - } - ]; - } - ]; } diff --git a/services/monitoring/default.nix b/services/monitoring/default.nix deleted file mode 100644 index 5110ff4..0000000 --- 
a/services/monitoring/default.nix +++ /dev/null @@ -1,14 +0,0 @@ -{ ... }: -{ - imports = [ - ./loki.nix - ./grafana.nix - ./prometheus.nix - ./blackbox.nix - ./exportarr.nix - ./pve.nix - ./alerttonotify.nix - ./pyroscope.nix - ./tempo.nix - ]; -} diff --git a/services/monitoring/exportarr.nix b/services/monitoring/exportarr.nix index b6d1436..970cad1 100644 --- a/services/monitoring/exportarr.nix +++ b/services/monitoring/exportarr.nix @@ -14,14 +14,4 @@ apiKeyFile = config.vault.secrets.sonarr-api-key.outputDir; port = 9709; }; - - # Scrape config - services.prometheus.scrapeConfigs = [ - { - job_name = "sonarr"; - static_configs = [{ - targets = [ "localhost:9709" ]; - }]; - } - ]; } diff --git a/services/monitoring/grafana.nix b/services/monitoring/grafana.nix deleted file mode 100644 index f72f344..0000000 --- a/services/monitoring/grafana.nix +++ /dev/null @@ -1,11 +0,0 @@ -{ pkgs, ... }: -{ - services.grafana = { - enable = true; - settings = { - server = { - http_addr = ""; - }; - }; - }; -} diff --git a/services/monitoring/loki.nix b/services/monitoring/loki.nix deleted file mode 100644 index 87ee06f..0000000 --- a/services/monitoring/loki.nix +++ /dev/null @@ -1,58 +0,0 @@ -{ ... 
}: -{ - services.loki = { - enable = true; - configuration = { - auth_enabled = false; - - server = { - http_listen_port = 3100; - }; - common = { - ring = { - instance_addr = "127.0.0.1"; - kvstore = { - store = "inmemory"; - }; - }; - replication_factor = 1; - path_prefix = "/var/lib/loki"; - }; - schema_config = { - configs = [ - { - from = "2024-01-01"; - store = "tsdb"; - object_store = "filesystem"; - schema = "v13"; - index = { - prefix = "loki_index_"; - period = "24h"; - }; - } - ]; - }; - storage_config = { - filesystem = { - directory = "/var/lib/loki/chunks"; - }; - }; - compactor = { - working_directory = "/var/lib/loki/compactor"; - compaction_interval = "10m"; - retention_enabled = true; - retention_delete_delay = "2h"; - retention_delete_worker_count = 150; - delete_request_store = "filesystem"; - }; - limits_config = { - retention_period = "30d"; - ingestion_rate_mb = 10; - ingestion_burst_size_mb = 20; - max_streams_per_user = 10000; - max_query_series = 500; - max_query_parallelism = 8; - }; - }; - }; -} diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix deleted file mode 100644 index 1d9c196..0000000 --- a/services/monitoring/prometheus.nix +++ /dev/null @@ -1,267 +0,0 @@ -{ self, lib, pkgs, ... }: -let - monLib = import ../../lib/monitoring.nix { inherit lib; }; - externalTargets = import ./external-targets.nix; - - nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets; - autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets; - - # Script to fetch AppRole token for Prometheus to use when scraping OpenBao metrics - fetchOpenbaoToken = pkgs.writeShellApplication { - name = "fetch-openbao-token"; - runtimeInputs = [ pkgs.curl pkgs.jq ]; - text = '' - VAULT_ADDR="https://vault01.home.2rjus.net:8200" - APPROLE_DIR="/var/lib/vault/approle" - OUTPUT_FILE="/run/secrets/prometheus/openbao-token" - - # Read AppRole credentials - if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! 
-f "$APPROLE_DIR/secret-id" ]; then - echo "AppRole credentials not found at $APPROLE_DIR" >&2 - exit 1 - fi - - ROLE_ID=$(cat "$APPROLE_DIR/role-id") - SECRET_ID=$(cat "$APPROLE_DIR/secret-id") - - # Authenticate to Vault - AUTH_RESPONSE=$(curl -sf -k -X POST \ - -d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \ - "$VAULT_ADDR/v1/auth/approle/login") - - # Extract token - VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token') - if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then - echo "Failed to extract Vault token from response" >&2 - exit 1 - fi - - # Write token to file - mkdir -p "$(dirname "$OUTPUT_FILE")" - echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE" - chown prometheus:prometheus "$OUTPUT_FILE" - chmod 0400 "$OUTPUT_FILE" - - echo "Successfully fetched OpenBao token" - ''; - }; -in -{ - # Systemd service to fetch AppRole token for Prometheus OpenBao scraping - # The token is used to authenticate when scraping /v1/sys/metrics - systemd.services.prometheus-openbao-token = { - description = "Fetch OpenBao token for Prometheus metrics scraping"; - after = [ "network-online.target" ]; - wants = [ "network-online.target" ]; - before = [ "prometheus.service" ]; - requiredBy = [ "prometheus.service" ]; - - serviceConfig = { - Type = "oneshot"; - ExecStart = lib.getExe fetchOpenbaoToken; - }; - }; - - # Timer to periodically refresh the token (AppRole tokens have 1-hour TTL) - systemd.timers.prometheus-openbao-token = { - description = "Refresh OpenBao token for Prometheus"; - wantedBy = [ "timers.target" ]; - timerConfig = { - OnBootSec = "5min"; - OnUnitActiveSec = "30min"; - RandomizedDelaySec = "5min"; - }; - }; - - # Fetch apiary bearer token from Vault - vault.secrets.prometheus-apiary-token = { - secretPath = "hosts/monitoring01/apiary-token"; - extractKey = "password"; - owner = "prometheus"; - group = "prometheus"; - services = [ "prometheus" ]; - }; - - services.prometheus = { - enable = true; - # syntax-only check because we 
use external credential files (e.g., openbao-token) - checkConfig = "syntax-only"; - alertmanager = { - enable = true; - configuration = { - global = { - }; - route = { - receiver = "webhook_natstonotify"; - group_wait = "30s"; - group_interval = "5m"; - repeat_interval = "1h"; - group_by = [ "alertname" ]; - }; - receivers = [ - { - name = "webhook_natstonotify"; - webhook_configs = [ - { - url = "http://localhost:5001/alert"; - } - ]; - } - ]; - }; - }; - alertmanagers = [ - { - static_configs = [ - { - targets = [ "localhost:9093" ]; - } - ]; - } - ]; - - retentionTime = "30d"; - globalConfig = { - scrape_interval = "15s"; - }; - rules = [ - (builtins.readFile ./rules.yml) - ]; - - scrapeConfigs = [ - # Auto-generated node-exporter targets from flake hosts + external - # Each static_config entry may have labels from homelab.host metadata - { - job_name = "node-exporter"; - static_configs = nodeExporterTargets; - } - # Systemd exporter on all hosts (same targets, different port) - # Preserves the same label grouping as node-exporter - { - job_name = "systemd-exporter"; - static_configs = map - (cfg: cfg // { - targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets; - }) - nodeExporterTargets; - } - # Local monitoring services (not auto-generated) - { - job_name = "prometheus"; - static_configs = [ - { - targets = [ "localhost:9090" ]; - } - ]; - } - { - job_name = "loki"; - static_configs = [ - { - targets = [ "localhost:3100" ]; - } - ]; - } - { - job_name = "grafana"; - static_configs = [ - { - targets = [ "localhost:3000" ]; - } - ]; - } - { - job_name = "alertmanager"; - static_configs = [ - { - targets = [ "localhost:9093" ]; - } - ]; - } - { - job_name = "pushgateway"; - honor_labels = true; - static_configs = [ - { - targets = [ "localhost:9091" ]; - } - ]; - } - # Caddy metrics from nix-cache02 (serves nix-cache.home.2rjus.net) - { - job_name = "nix-cache_caddy"; - scheme = "https"; - static_configs = [ - { - targets = [ 
"nix-cache.home.2rjus.net" ]; - } - ]; - } - # pve-exporter with complex relabel config - { - job_name = "pve-exporter"; - static_configs = [ - { - targets = [ "10.69.12.75" ]; - } - ]; - metrics_path = "/pve"; - params = { - module = [ "default" ]; - cluster = [ "1" ]; - node = [ "1" ]; - }; - relabel_configs = [ - { - source_labels = [ "__address__" ]; - target_label = "__param_target"; - } - { - source_labels = [ "__param_target" ]; - target_label = "instance"; - } - { - target_label = "__address__"; - replacement = "127.0.0.1:9221"; - } - ]; - } - # OpenBao metrics with bearer token auth - { - job_name = "openbao"; - scheme = "https"; - metrics_path = "/v1/sys/metrics"; - params = { - format = [ "prometheus" ]; - }; - static_configs = [{ - targets = [ "vault01.home.2rjus.net:8200" ]; - }]; - authorization = { - type = "Bearer"; - credentials_file = "/run/secrets/prometheus/openbao-token"; - }; - } - # Apiary external service - { - job_name = "apiary"; - scheme = "https"; - scrape_interval = "60s"; - static_configs = [{ - targets = [ "apiary.t-juice.club" ]; - }]; - authorization = { - type = "Bearer"; - credentials_file = "/run/secrets/prometheus-apiary-token"; - }; - } - ] ++ autoScrapeConfigs; - - pushgateway = { - enable = true; - web = { - external-url = "https://pushgw.home.2rjus.net"; - }; - }; - }; -} diff --git a/services/monitoring/pve.nix b/services/monitoring/pve.nix index 45f92ef..b35e91f 100644 --- a/services/monitoring/pve.nix +++ b/services/monitoring/pve.nix @@ -1,7 +1,7 @@ { config, ... }: { vault.secrets.pve-exporter = { - secretPath = "hosts/monitoring01/pve-exporter"; + secretPath = "hosts/monitoring02/pve-exporter"; extractKey = "config"; outputDir = "/run/secrets/pve_exporter"; mode = "0444"; diff --git a/services/monitoring/pyroscope.nix b/services/monitoring/pyroscope.nix deleted file mode 100644 index 03274ef..0000000 --- a/services/monitoring/pyroscope.nix +++ /dev/null @@ -1,8 +0,0 @@ -{ ... 
}: -{ - virtualisation.oci-containers.containers.pyroscope = { - pull = "missing"; - image = "grafana/pyroscope:latest"; - ports = [ "4040:4040" ]; - }; -} diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index a5f8abf..14b9baf 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -259,32 +259,32 @@ groups: description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}." - name: monitoring_rules rules: - - alert: prometheus_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0 + - alert: victoriametrics_not_running + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0 for: 5m labels: severity: critical annotations: - summary: "Prometheus service not running on {{ $labels.instance }}" - description: "Prometheus service not running on {{ $labels.instance }}" + summary: "VictoriaMetrics service not running on {{ $labels.instance }}" + description: "VictoriaMetrics service not running on {{ $labels.instance }}" + - alert: vmalert_not_running + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "vmalert service not running on {{ $labels.instance }}" + description: "vmalert service not running on {{ $labels.instance }}" - alert: alertmanager_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 for: 5m labels: severity: critical annotations: summary: "Alertmanager service not running on {{ $labels.instance }}" description: "Alertmanager service not running on {{ $labels.instance }}" 
- - alert: pushgateway_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Pushgateway service not running on {{ $labels.instance }}" - description: "Pushgateway service not running on {{ $labels.instance }}" - alert: loki_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0 + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0 for: 5m labels: severity: critical @@ -292,29 +292,13 @@ groups: summary: "Loki service not running on {{ $labels.instance }}" description: "Loki service not running on {{ $labels.instance }}" - alert: grafana_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 + expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 for: 5m labels: severity: warning annotations: summary: "Grafana service not running on {{ $labels.instance }}" description: "Grafana service not running on {{ $labels.instance }}" - - alert: tempo_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Tempo service not running on {{ $labels.instance }}" - description: "Tempo service not running on {{ $labels.instance }}" - - alert: pyroscope_not_running - expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Pyroscope service not running on {{ $labels.instance }}" - description: "Pyroscope service not running on {{ $labels.instance }}" - name: proxmox_rules 
rules: - alert: pve_node_down diff --git a/services/monitoring/tempo.nix b/services/monitoring/tempo.nix deleted file mode 100644 index 8fda7f8..0000000 --- a/services/monitoring/tempo.nix +++ /dev/null @@ -1,37 +0,0 @@ -{ ... }: -{ - services.tempo = { - enable = true; - settings = { - server = { - http_listen_port = 3200; - grpc_listen_port = 3201; - }; - distributor = { - receivers = { - otlp = { - protocols = { - http = { - endpoint = ":4318"; - cors = { - allowed_origins = [ "*.home.2rjus.net" ]; - }; - }; - }; - }; - }; - }; - storage = { - trace = { - backend = "local"; - local = { - path = "/var/lib/tempo"; - }; - wal = { - path = "/var/lib/tempo/wal"; - }; - }; - }; - }; - }; -} diff --git a/services/victoriametrics/default.nix b/services/victoriametrics/default.nix index 2c2af1b..6275edc 100644 --- a/services/victoriametrics/default.nix +++ b/services/victoriametrics/default.nix @@ -6,6 +6,24 @@ let nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets; autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets; + # TLS endpoints to monitor for certificate expiration via blackbox exporter + tlsTargets = [ + "https://vault.home.2rjus.net:8200" + "https://auth.home.2rjus.net" + "https://testvm01.home.2rjus.net" + "https://nzbget.home.2rjus.net" + "https://radarr.home.2rjus.net" + "https://sonarr.home.2rjus.net" + "https://ha.home.2rjus.net" + "https://z2m.home.2rjus.net" + "https://metrics.home.2rjus.net" + "https://alertmanager.home.2rjus.net" + "https://grafana.home.2rjus.net" + "https://jelly.home.2rjus.net" + "https://nix-cache.home.2rjus.net" + "https://grafana-test.home.2rjus.net" + ]; + # Script to fetch AppRole token for VictoriaMetrics to use when scraping OpenBao metrics fetchOpenbaoToken = pkgs.writeShellApplication { name = "fetch-openbao-token-vm"; @@ -107,6 +125,39 @@ let credentials_file = "/run/secrets/victoriametrics-apiary-token"; }; } + # Blackbox TLS certificate monitoring + { + job_name = 
"blackbox_tls"; + metrics_path = "/probe"; + params = { + module = [ "https_cert" ]; + }; + static_configs = [{ targets = tlsTargets; }]; + relabel_configs = [ + { + source_labels = [ "__address__" ]; + target_label = "__param_target"; + } + { + source_labels = [ "__param_target" ]; + target_label = "instance"; + } + { + target_label = "__address__"; + replacement = "127.0.0.1:9115"; + } + ]; + } + # Sonarr exporter + { + job_name = "sonarr"; + static_configs = [{ targets = [ "localhost:9709" ]; }]; + } + # Proxmox VE exporter + { + job_name = "pve"; + static_configs = [{ targets = [ "localhost:9221" ]; }]; + } ] ++ autoScrapeConfigs; in { @@ -152,7 +203,7 @@ in # Fetch apiary bearer token from Vault vault.secrets.victoriametrics-apiary-token = { - secretPath = "hosts/monitoring01/apiary-token"; + secretPath = "hosts/monitoring02/apiary-token"; extractKey = "password"; owner = "victoriametrics"; group = "victoriametrics"; diff --git a/system/vault-secrets.nix b/system/vault-secrets.nix index 60466dd..bc5608c 100644 --- a/system/vault-secrets.nix +++ b/system/vault-secrets.nix @@ -57,7 +57,7 @@ let type = types.str; description = '' Path to the secret in Vault (without /v1/secret/data/ prefix). 
- Example: "hosts/monitoring01/grafana-admin" + Example: "hosts/ha1/mqtt-password" ''; }; @@ -152,13 +152,11 @@ in ''; example = literalExpression '' { - grafana-admin = { - secretPath = "hosts/monitoring01/grafana-admin"; - owner = "grafana"; - group = "grafana"; - restartTrigger = true; - restartInterval = "daily"; - services = [ "grafana" ]; + mqtt-password = { + secretPath = "hosts/ha1/mqtt-password"; + owner = "mosquitto"; + group = "mosquitto"; + services = [ "mosquitto" ]; }; } ''; diff --git a/terraform/vault/approle.tf b/terraform/vault/approle.tf index 1e5956a..6a12464 100644 --- a/terraform/vault/approle.tf +++ b/terraform/vault/approle.tf @@ -40,23 +40,13 @@ EOT # Define host access policies locals { host_policies = { - # Example: monitoring01 host - # "monitoring01" = { - # paths = [ - # "secret/data/hosts/monitoring01/*", - # "secret/data/services/prometheus/*", - # "secret/data/services/grafana/*", - # "secret/data/shared/smtp/*" - # ] - # extra_policies = ["some-other-policy"] # Optional: additional policies - # } - - # Example: ha1 host + # Example: # "ha1" = { # paths = [ # "secret/data/hosts/ha1/*", # "secret/data/shared/mqtt/*" # ] + # extra_policies = ["some-other-policy"] # Optional: additional policies # } "ha1" = { @@ -66,16 +56,6 @@ locals { ] } - "monitoring01" = { - paths = [ - "secret/data/hosts/monitoring01/*", - "secret/data/shared/backup/*", - "secret/data/shared/nats/*", - "secret/data/services/exportarr/*", - ] - extra_policies = ["prometheus-metrics"] - } - # Wave 1: hosts with no service secrets (only need vault.enable for future use) "nats1" = { paths = [ diff --git a/terraform/vault/hosts-generated.tf b/terraform/vault/hosts-generated.tf index 5257919..5fa92c2 100644 --- a/terraform/vault/hosts-generated.tf +++ b/terraform/vault/hosts-generated.tf @@ -47,8 +47,8 @@ locals { "monitoring02" = { paths = [ "secret/data/hosts/monitoring02/*", - "secret/data/hosts/monitoring01/apiary-token", "secret/data/services/grafana/*", + 
"secret/data/services/exportarr/*", "secret/data/shared/nats/nkey", ] extra_policies = ["prometheus-metrics"] diff --git a/terraform/vault/secrets.tf b/terraform/vault/secrets.tf index a84baff..83416a6 100644 --- a/terraform/vault/secrets.tf +++ b/terraform/vault/secrets.tf @@ -10,10 +10,6 @@ resource "vault_mount" "kv" { locals { secrets = { # Example host-specific secrets - # "hosts/monitoring01/grafana-admin" = { - # auto_generate = true - # password_length = 32 - # } # "hosts/ha1/mqtt-password" = { # auto_generate = true # password_length = 24 @@ -35,11 +31,6 @@ locals { # } # } - "hosts/monitoring01/grafana-admin" = { - auto_generate = true - password_length = 32 - } - "hosts/ha1/mqtt-password" = { auto_generate = true password_length = 24 @@ -57,8 +48,8 @@ locals { data = { nkey = var.nats_nkey } } - # PVE exporter config for monitoring01 - "hosts/monitoring01/pve-exporter" = { + # PVE exporter config for monitoring02 + "hosts/monitoring02/pve-exporter" = { auto_generate = false data = { config = var.pve_exporter_config } } @@ -149,7 +140,7 @@ locals { } # Bearer token for scraping apiary metrics - "hosts/monitoring01/apiary-token" = { + "hosts/monitoring02/apiary-token" = { auto_generate = true password_length = 64 }