monitoring: add blackbox exporter for TLS certificate monitoring
All checks were successful
Run nix flake check / flake-check (push) Successful in 2m6s
All checks were successful
Run nix flake check / flake-check (push) Successful in 2m6s
Add blackbox exporter to monitoring01 to probe TLS endpoints and alert on expiring certificates. Monitors all ACME-managed certificates from OpenBao PKI, including Caddy auto-TLS services. Alerts: tls_certificate_expiring_soon (< 7 days, warning); tls_certificate_expiring_critical (< 24h, critical); tls_probe_failed (connectivity issues). Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
109
services/monitoring/blackbox.nix
Normal file
109
services/monitoring/blackbox.nix
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# NixOS module: runs a Prometheus blackbox exporter on this host, scrapes a
# fixed list of HTTPS endpoints through it, and defines alert rules for
# expiring certificates and failed probes.
{ config, pkgs, ... }:
let
  # Address Prometheus scrapes to reach the local blackbox exporter. The port
  # is read from the exporter's own option so the scrape job cannot drift from
  # the exporter configuration (the option's default is 9115).
  blackboxAddress = "127.0.0.1:${toString config.services.prometheus.exporters.blackbox.port}";

  # TLS endpoints to monitor for certificate expiration
  # These are all services using ACME certificates from OpenBao PKI
  tlsTargets = [
    # Direct ACME certs (security.acme.certs)
    "https://vault.home.2rjus.net:8200"
    "https://auth.home.2rjus.net"
    "https://testvm01.home.2rjus.net"

    # Caddy auto-TLS on http-proxy
    "https://nzbget.home.2rjus.net"
    "https://radarr.home.2rjus.net"
    "https://sonarr.home.2rjus.net"
    "https://ha.home.2rjus.net"
    "https://z2m.home.2rjus.net"
    "https://prometheus.home.2rjus.net"
    "https://alertmanager.home.2rjus.net"
    "https://grafana.home.2rjus.net"
    "https://jelly.home.2rjus.net"
    "https://pyroscope.home.2rjus.net"
    "https://pushgw.home.2rjus.net"

    # Caddy auto-TLS on nix-cache01
    "https://nix-cache.home.2rjus.net"

    # Caddy auto-TLS on grafana01
    "https://grafana-test.home.2rjus.net"
  ];
in
{
  services.prometheus.exporters.blackbox = {
    enable = true;
    # Single probe module, `https_cert`: an HTTP probe with a 10s timeout that
    # is configured to fail when the connection is not SSL and to prefer IPv4.
    configFile = pkgs.writeText "blackbox.yml" ''
      modules:
        https_cert:
          prober: http
          timeout: 10s
          http:
            fail_if_not_ssl: true
            preferred_ip_protocol: ip4
    '';
  };

  # Add blackbox scrape config and alert rules to Prometheus
  services.prometheus = {
    scrapeConfigs = [
      {
        job_name = "blackbox_tls";
        metrics_path = "/probe";
        params = {
          module = [ "https_cert" ];
        };
        static_configs = [{
          targets = tlsTargets;
        }];
        relabel_configs = [
          # Pass the target URL to blackbox as a parameter
          {
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          # Use the target URL as the instance label
          {
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          # Point the actual scrape at the local blackbox exporter
          {
            target_label = "__address__";
            replacement = blackboxAddress;
          }
        ];
      }
    ];

    # One rule group with three alerts:
    #   - warning when the earliest certificate expiry is under 7 days away
    #     (held for 1h before firing),
    #   - critical when it is under 24 hours away (fires immediately),
    #   - warning when a blackbox_tls probe has been failing for 5 minutes.
    rules = [
      ''
        groups:
          - name: certificate_rules
            rules:
              - alert: tls_certificate_expiring_soon
                expr: (probe_ssl_earliest_cert_expiry - time()) < 86400 * 7
                for: 1h
                labels:
                  severity: warning
                annotations:
                  summary: "TLS certificate expiring soon on {{ $labels.instance }}"
                  description: "The TLS certificate for {{ $labels.instance }} expires in less than 7 days."
              - alert: tls_certificate_expiring_critical
                expr: (probe_ssl_earliest_cert_expiry - time()) < 86400
                for: 0m
                labels:
                  severity: critical
                annotations:
                  summary: "TLS certificate expiring within 24h on {{ $labels.instance }}"
                  description: "The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required."
              - alert: tls_probe_failed
                expr: probe_success{job="blackbox_tls"} == 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "TLS probe failed for {{ $labels.instance }}"
                  description: "Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable."
      ''
    ];
  };
}
|
||||||
@@ -4,6 +4,7 @@
|
|||||||
./loki.nix
|
./loki.nix
|
||||||
./grafana.nix
|
./grafana.nix
|
||||||
./prometheus.nix
|
./prometheus.nix
|
||||||
|
./blackbox.nix
|
||||||
./pve.nix
|
./pve.nix
|
||||||
./alerttonotify.nix
|
./alerttonotify.nix
|
||||||
./pyroscope.nix
|
./pyroscope.nix
|
||||||
|
|||||||
Reference in New Issue
Block a user