monitoring: add blackbox exporter for TLS certificate monitoring
All checks were successful
Run nix flake check / flake-check (push) Successful in 2m6s
All checks were successful
Run nix flake check / flake-check (push) Successful in 2m6s
Add blackbox exporter to monitoring01 to probe TLS endpoints and alert on expiring certificates. Monitors all ACME-managed certificates from OpenBao PKI, including Caddy auto-TLS services.

Alerts:
- tls_certificate_expiring_soon (< 7 days, warning)
- tls_certificate_expiring_critical (< 24h, critical)
- tls_probe_failed (connectivity issues)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
109
services/monitoring/blackbox.nix
Normal file
109
services/monitoring/blackbox.nix
Normal file
@@ -0,0 +1,109 @@
|
||||
{ pkgs, ... }:
|
||||
let
|
||||
# HTTPS endpoints whose certificates are probed for upcoming expiry.
# Per the original author, every entry is a service using an ACME
# certificate issued by OpenBao PKI.
tlsTargets =
  let
    # Expand a bare host label into its full probe URL.
    mkUrl = host: "https://${host}.home.2rjus.net";
  in
  # Direct ACME certs (security.acme.certs); the vault endpoint carries an
  # explicit port, so it is written out in full instead of going through mkUrl.
  [ "https://vault.home.2rjus.net:8200" ]
  ++ map mkUrl [
    # Direct ACME certs (security.acme.certs), continued
    "auth"
    "testvm01"

    # Caddy auto-TLS on http-proxy
    "nzbget"
    "radarr"
    "sonarr"
    "ha"
    "z2m"
    "prometheus"
    "alertmanager"
    "grafana"
    "jelly"
    "pyroscope"
    "pushgw"

    # Caddy auto-TLS on nix-cache01
    "nix-cache"

    # Caddy auto-TLS on grafana01
    "grafana-test"
  ];
|
||||
in
|
||||
{
|
||||
# Blackbox exporter with a single probe module, `https_cert`, used by the
# Prometheus scrape job to fetch each endpoint over HTTPS.
services.prometheus.exporters.blackbox =
  let
    # Exporter configuration file, rendered into the Nix store.
    # - fail_if_not_ssl: the probe is reported as failed unless TLS was used.
    # - preferred_ip_protocol: probes prefer IPv4.
    probeModules = pkgs.writeText "blackbox.yml" ''
      modules:
        https_cert:
          prober: http
          timeout: 10s
          http:
            fail_if_not_ssl: true
            preferred_ip_protocol: ip4
    '';
  in
  {
    configFile = probeModules;
    enable = true;
  };
|
||||
|
||||
# Add blackbox scrape config and alert rules to Prometheus.
#
# The scrape job asks the blackbox exporter to probe every URL in
# `tlsTargets` with the `https_cert` module; the alert rules then watch the
# resulting probe metrics for expiring certificates and failed probes.
services.prometheus = {
  scrapeConfigs = [
    {
      job_name = "blackbox_tls";
      metrics_path = "/probe";
      params = {
        module = [ "https_cert" ];
      };
      static_configs = [
        {
          targets = tlsTargets;
        }
      ];
      relabel_configs = [
        # Pass the target URL to blackbox as the `target` query parameter.
        {
          source_labels = [ "__address__" ];
          target_label = "__param_target";
        }
        # Use the target URL as the instance label so alerts name the
        # probed endpoint rather than the exporter.
        {
          source_labels = [ "__param_target" ];
          target_label = "instance";
        }
        # Point the actual scrape at the local blackbox exporter.
        # NOTE(review): 9115 must match the exporter's listen port — confirm
        # if the exporter port is ever overridden.
        {
          target_label = "__address__";
          replacement = "127.0.0.1:9115";
        }
      ];
    }
  ];

  rules = [
    # All three rules are scoped to job="blackbox_tls" so they only evaluate
    # series produced by the scrape job above, and stay consistent with each
    # other. Thresholds are in seconds: 86400 * 7 = 7 days, 86400 = 24 hours.
    ''
      groups:
        - name: certificate_rules
          rules:
            - alert: tls_certificate_expiring_soon
              expr: (probe_ssl_earliest_cert_expiry{job="blackbox_tls"} - time()) < 86400 * 7
              for: 1h
              labels:
                severity: warning
              annotations:
                summary: "TLS certificate expiring soon on {{ $labels.instance }}"
                description: "The TLS certificate for {{ $labels.instance }} expires in less than 7 days."
            - alert: tls_certificate_expiring_critical
              expr: (probe_ssl_earliest_cert_expiry{job="blackbox_tls"} - time()) < 86400
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: "TLS certificate expiring within 24h on {{ $labels.instance }}"
                description: "The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required."
            - alert: tls_probe_failed
              expr: probe_success{job="blackbox_tls"} == 0
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: "TLS probe failed for {{ $labels.instance }}"
                description: "Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable."
    ''
  ];
};
|
||||
}
|
||||
@@ -4,6 +4,7 @@
|
||||
./loki.nix
|
||||
./grafana.nix
|
||||
./prometheus.nix
|
||||
./blackbox.nix
|
||||
./pve.nix
|
||||
./alerttonotify.nix
|
||||
./pyroscope.nix
|
||||
|
||||
Reference in New Issue
Block a user