diff --git a/docs/plans/monitoring-gaps.md b/docs/plans/completed/monitoring-gaps.md
similarity index 100%
rename from docs/plans/monitoring-gaps.md
rename to docs/plans/completed/monitoring-gaps.md
diff --git a/services/authelia/default.nix b/services/authelia/default.nix
index 78de3cd..9d98fad 100644
--- a/services/authelia/default.nix
+++ b/services/authelia/default.nix
@@ -1,5 +1,10 @@
 { config, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "authelia";
+    port = 9959;
+  }];
+
   sops.secrets.authelia_ldap_password = {
     format = "yaml";
     sopsFile = ../../secrets/auth01/secrets.yaml;
@@ -45,6 +50,12 @@
       storageEncryptionKeyFile = config.sops.secrets.authelia_storage_encryption_key_file.path;
     };
     settings = {
+      telemetry = {
+        metrics = {
+          enabled = true;
+          address = "tcp://0.0.0.0:9959";
+        };
+      };
       access_control = {
         default_policy = "two_factor";
       };
diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix
index bc29de0..378a5b0 100644
--- a/services/monitoring/prometheus.nix
+++ b/services/monitoring/prometheus.nix
@@ -7,8 +7,19 @@ let
   autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
 in
 {
+  # OpenBao token for scraping metrics
+  vault.secrets.openbao-token = {
+    secretPath = "hosts/monitoring01/openbao-token";
+    extractKey = "token";
+    outputDir = "/run/secrets/prometheus";
+    mode = "0400";
+    owner = "prometheus";
+    services = [ "prometheus" ];
+  };
   services.prometheus = {
     enable = true;
+    # syntax-only check because we use external credential files (e.g., openbao-token)
+    checkConfig = "syntax-only";
     alertmanager = {
       enable = true;
       configuration = {
@@ -61,6 +72,15 @@ in
           }
         ];
       }
+      # Systemd exporter on all hosts (same targets, different port)
+      {
+        job_name = "systemd-exporter";
+        static_configs = [
+          {
+            targets = map (t: builtins.replaceStrings [":9100"] [":9558"] t) nodeExporterTargets;
+          }
+        ];
+      }
       # Local monitoring services (not auto-generated)
       {
         job_name = "prometheus";
@@ -152,6 +172,22 @@ in
           }
         ];
       }
+      # OpenBao metrics with bearer token auth
+      {
+        job_name = "openbao";
+        scheme = "https";
+        metrics_path = "/v1/sys/metrics";
+        params = {
+          format = [ "prometheus" ];
+        };
+        static_configs = [{
+          targets = [ "vault01.home.2rjus.net:8200" ];
+        }];
+        authorization = {
+          type = "Bearer";
+          credentials_file = "/run/secrets/prometheus/openbao-token";
+        };
+      }
     ] ++ autoScrapeConfigs;
 
     pushgateway = {
diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index ea5dc42..30d01eb 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -115,6 +115,14 @@ groups:
       annotations:
         summary: "NSD not running on {{ $labels.instance }}"
         description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: unbound_low_cache_hit_ratio
+        expr: (rate(unbound_cache_hits_total[5m]) / (rate(unbound_cache_hits_total[5m]) + rate(unbound_cache_misses_total[5m]))) < 0.5
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low DNS cache hit ratio on {{ $labels.instance }}"
+          description: "Unbound cache hit ratio is below 50% on {{ $labels.instance }}."
   - name: http_proxy_rules
     rules:
       - alert: caddy_down
@@ -151,6 +159,14 @@ groups:
       annotations:
         summary: "NATS not running on {{ $labels.instance }}"
         description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: nats_slow_consumers
+        expr: nats_core_slow_consumer_count > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "NATS has slow consumers on {{ $labels.instance }}"
+          description: "NATS has {{ $value }} slow consumers on {{ $labels.instance }}."
   - name: nix_cache_rules
     rules:
       - alert: build_flakes_service_not_active_recently
@@ -364,3 +380,83 @@ groups:
       annotations:
         summary: "Proxmox VM {{ $labels.id }} is stopped"
         description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
+  - name: postgres_rules
+    rules:
+      - alert: postgres_down
+        expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL not running on {{ $labels.instance }}"
+          description: "PostgreSQL has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: postgres_exporter_down
+        expr: up{job="postgres"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "PostgreSQL exporter down on {{ $labels.instance }}"
+          description: "Cannot scrape PostgreSQL metrics from {{ $labels.instance }}."
+      - alert: postgres_high_connections
+        expr: pg_stat_activity_count / pg_settings_max_connections > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "PostgreSQL connection pool near exhaustion on {{ $labels.instance }}"
+          description: "PostgreSQL is using over 80% of max_connections on {{ $labels.instance }}."
+  - name: auth_rules
+    rules:
+      - alert: authelia_down
+        expr: node_systemd_unit_state{instance="auth01.home.2rjus.net:9100", name="authelia-auth.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Authelia not running on {{ $labels.instance }}"
+          description: "Authelia has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: lldap_down
+        expr: node_systemd_unit_state{instance="auth01.home.2rjus.net:9100", name="lldap.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "LLDAP not running on {{ $labels.instance }}"
+          description: "LLDAP has been down on {{ $labels.instance }} more than 5 minutes."
+  - name: jellyfin_rules
+    rules:
+      - alert: jellyfin_down
+        expr: up{job="jellyfin"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Jellyfin not responding on {{ $labels.instance }}"
+          description: "Cannot scrape Jellyfin metrics from {{ $labels.instance }} for 5 minutes."
+  - name: vault_rules
+    rules:
+      - alert: openbao_down
+        expr: node_systemd_unit_state{instance="vault01.home.2rjus.net:9100", name="openbao.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "OpenBao not running on {{ $labels.instance }}"
+          description: "OpenBao has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: openbao_sealed
+        expr: vault_core_unsealed == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "OpenBao is sealed on {{ $labels.instance }}"
+          description: "OpenBao has been sealed on {{ $labels.instance }} for more than 5 minutes."
+      - alert: openbao_scrape_down
+        expr: up{job="openbao"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}"
+          description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}."
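Note: the service modules touched in this change (authelia above; nats, unbound, and postgres below) all register their exporters through homelab.monitoring.scrapeTargets, which prometheus.nix folds in via monLib.generateScrapeConfigs self externalTargets. Neither that option nor the generator is part of this diff; the fragment below is only a sketch of how such a wiring could look, with hypothetical option types and a hypothetical "<host>.home.2rjus.net:<port>" target format.

  # Hypothetical sketch -- the real option and generator live outside this diff.
  { lib, ... }:
  {
    options.homelab.monitoring.scrapeTargets = lib.mkOption {
      type = lib.types.listOf (lib.types.submodule {
        options = {
          job_name = lib.mkOption { type = lib.types.str; };
          port = lib.mkOption { type = lib.types.port; };
        };
      });
      default = [ ];
      description = "Prometheus scrape targets this host exposes.";
    };
  }

  # A generator like monLib.generateScrapeConfigs could then map each
  # declaration to a scrape config, e.g. the authelia entry above becomes:
  #   { job_name = "authelia";
  #     static_configs = [ { targets = [ "auth01.home.2rjus.net:9959" ]; } ];
  #   }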
diff --git a/services/nats/default.nix b/services/nats/default.nix
index 5fbcea7..2b6f5e9 100644
--- a/services/nats/default.nix
+++ b/services/nats/default.nix
@@ -1,10 +1,16 @@
 { ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "nats";
+    port = 8222;
+  }];
+
   services.nats = {
     enable = true;
     jetstream = true;
     serverName = "nats1";
     settings = {
+      http_port = 8222;
       accounts = {
         ADMIN = {
           users = [
diff --git a/services/ns/resolver.nix b/services/ns/resolver.nix
index 73ec72d..9dde679 100644
--- a/services/ns/resolver.nix
+++ b/services/ns/resolver.nix
@@ -1,10 +1,24 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "unbound";
+    port = 9167;
+  }];
+
   networking.firewall.allowedTCPPorts = [ 53 ];
   networking.firewall.allowedUDPPorts = [ 53 ];
+
+  services.prometheus.exporters.unbound = {
+    enable = true;
+    unbound.host = "unix:///run/unbound/unbound.ctl";
+  };
+
+  # Grant exporter access to unbound socket
+  systemd.services.prometheus-unbound-exporter.serviceConfig.SupplementaryGroups = [ "unbound" ];
+
   services.unbound = {
     enable = true;
@@ -23,6 +37,11 @@
       do-ip6 = "no";
       do-udp = "yes";
       do-tcp = "yes";
+      extended-statistics = true;
+    };
+    remote-control = {
+      control-enable = true;
+      control-interface = "/run/unbound/unbound.ctl";
     };
     stub-zone = {
       name = "home.2rjus.net";
diff --git a/services/postgres/postgres.nix b/services/postgres/postgres.nix
index 45aa7b0..c4b8d99 100644
--- a/services/postgres/postgres.nix
+++ b/services/postgres/postgres.nix
@@ -1,5 +1,15 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "postgres";
+    port = 9187;
+  }];
+
+  services.prometheus.exporters.postgres = {
+    enable = true;
+    runAsLocalSuperUser = true; # Use peer auth as postgres user
+  };
+
   services.postgresql = {
     enable = true;
     enableJIT = true;
diff --git a/services/vault/default.nix b/services/vault/default.nix
index 3439a67..babf575 100644
--- a/services/vault/default.nix
+++ b/services/vault/default.nix
@@ -166,6 +166,11 @@ in
     settings = {
       ui = true;
 
+      telemetry = {
+        prometheus_retention_time = "60s";
+        disable_hostname = true;
+      };
+
       storage.file.path = "/var/lib/openbao";
       listener.default = {
         type = "tcp";
diff --git a/system/monitoring/metrics.nix b/system/monitoring/metrics.nix
index b65837f..08fb1b1 100644
--- a/system/monitoring/metrics.nix
+++ b/system/monitoring/metrics.nix
@@ -9,4 +9,9 @@
       "processes"
     ];
   };
+
+  services.prometheus.exporters.systemd = {
+    enable = true;
+    # Default port: 9558
+  };
 }
diff --git a/terraform/vault/policies.tf b/terraform/vault/policies.tf
new file mode 100644
index 0000000..e0f90e4
--- /dev/null
+++ b/terraform/vault/policies.tf
@@ -0,0 +1,21 @@
+# Generic policies for services (not host-specific)
+
+resource "vault_policy" "prometheus_metrics" {
+  name = "prometheus-metrics"
+  policy = <
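Note: the systemd exporter enabled in system/monitoring/metrics.nix listens on its default port 9558, and the systemd-exporter job in prometheus.nix reuses nodeExporterTargets by rewriting the node-exporter port. A quick illustration of that rewrite (the host names here are made up for the example):

  # builtins.replaceStrings swaps ":9100" for ":9558" in each target string:
  map (t: builtins.replaceStrings [":9100"] [":9558"] t)
    [ "host-a.home.2rjus.net:9100" "host-b.home.2rjus.net:9100" ]
  # => [ "host-a.home.2rjus.net:9558" "host-b.home.2rjus.net:9558" ]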