diff --git a/docs/plans/cert-monitoring.md b/docs/plans/completed/cert-monitoring.md
similarity index 100%
rename from docs/plans/cert-monitoring.md
rename to docs/plans/completed/cert-monitoring.md
diff --git a/services/monitoring/blackbox.nix b/services/monitoring/blackbox.nix
new file mode 100644
index 0000000..47bd430
--- /dev/null
+++ b/services/monitoring/blackbox.nix
@@ -0,0 +1,118 @@
+# Certificate-expiry monitoring: enables the blackbox exporter and adds a
+# Prometheus scrape job plus alert rules for the TLS endpoints listed below.
+{ pkgs, ... }:
+let
+  # TLS endpoints to monitor for certificate expiration
+  # These are all services using ACME certificates from OpenBao PKI
+  tlsTargets = [
+    # Direct ACME certs (security.acme.certs)
+    "https://vault.home.2rjus.net:8200"
+    "https://auth.home.2rjus.net"
+    "https://testvm01.home.2rjus.net"
+
+    # Caddy auto-TLS on http-proxy
+    "https://nzbget.home.2rjus.net"
+    "https://radarr.home.2rjus.net"
+    "https://sonarr.home.2rjus.net"
+    "https://ha.home.2rjus.net"
+    "https://z2m.home.2rjus.net"
+    "https://prometheus.home.2rjus.net"
+    "https://alertmanager.home.2rjus.net"
+    "https://grafana.home.2rjus.net"
+    "https://jelly.home.2rjus.net"
+    "https://pyroscope.home.2rjus.net"
+    "https://pushgw.home.2rjus.net"
+
+    # Caddy auto-TLS on nix-cache01
+    "https://nix-cache.home.2rjus.net"
+
+    # Caddy auto-TLS on grafana01
+    "https://grafana-test.home.2rjus.net"
+  ];
+in
+{
+  services.prometheus.exporters.blackbox = {
+    enable = true;
+    # Single probe module: an HTTP request (IPv4 preferred) that fails unless
+    # the connection is TLS.
+    # NOTE(review): the http prober accepts only 2xx responses by default, so
+    # a target answering 401/403 would report probe_success 0 even with a
+    # healthy certificate -- confirm every target answers 2xx unauthenticated.
+    configFile = pkgs.writeText "blackbox.yml" ''
+      modules:
+        https_cert:
+          prober: http
+          timeout: 10s
+          http:
+            fail_if_not_ssl: true
+            preferred_ip_protocol: ip4
+    '';
+  };
+
+  # Add blackbox scrape config and alert rules to Prometheus
+  services.prometheus = {
+    scrapeConfigs = [
+      {
+        job_name = "blackbox_tls";
+        metrics_path = "/probe";
+        params = {
+          module = [ "https_cert" ];
+        };
+        static_configs = [{
+          targets = tlsTargets;
+        }];
+        relabel_configs = [
+          # Pass the target URL to blackbox as a parameter
+          {
+            source_labels = [ "__address__" ];
+            target_label = "__param_target";
+          }
+          # Use the target URL as the instance label
+          {
+            source_labels = [ "__param_target" ];
+            target_label = "instance";
+          }
+          # Point the actual scrape at the local blackbox exporter
+          {
+            target_label = "__address__";
+            replacement = "127.0.0.1:9115";
+          }
+        ];
+      }
+    ];
+
+    rules = [
+      # Expiry alerts are scoped to the blackbox_tls job, matching
+      # tls_probe_failed, so series from any other probe job cannot trigger them.
+      ''
+        groups:
+          - name: certificate_rules
+            rules:
+              - alert: tls_certificate_expiring_soon
+                expr: (probe_ssl_earliest_cert_expiry{job="blackbox_tls"} - time()) < 86400 * 7
+                for: 1h
+                labels:
+                  severity: warning
+                annotations:
+                  summary: "TLS certificate expiring soon on {{ $labels.instance }}"
+                  description: "The TLS certificate for {{ $labels.instance }} expires in less than 7 days."
+              - alert: tls_certificate_expiring_critical
+                expr: (probe_ssl_earliest_cert_expiry{job="blackbox_tls"} - time()) < 86400
+                for: 0m
+                labels:
+                  severity: critical
+                annotations:
+                  summary: "TLS certificate expiring within 24h on {{ $labels.instance }}"
+                  description: "The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required."
+              - alert: tls_probe_failed
+                expr: probe_success{job="blackbox_tls"} == 0
+                for: 5m
+                labels:
+                  severity: warning
+                annotations:
+                  summary: "TLS probe failed for {{ $labels.instance }}"
+                  description: "Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable."
+      ''
+    ];
+  };
+}
diff --git a/services/monitoring/default.nix b/services/monitoring/default.nix
index 9c96ffd..8a8b1a8 100644
--- a/services/monitoring/default.nix
+++ b/services/monitoring/default.nix
@@ -4,6 +4,7 @@
     ./loki.nix
     ./grafana.nix
     ./prometheus.nix
+    ./blackbox.nix
     ./pve.nix
     ./alerttonotify.nix
     ./pyroscope.nix