From 3a9a47f1adc4759cb38b1d47b854b897d2cc4876 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?=
Date: Thu, 5 Feb 2026 01:12:42 +0100
Subject: [PATCH] monitoring: exclude step-ca serving cert from general expiry alert

The step-ca serving certificate is auto-renewed with a 24h lifetime,
so it always triggers the general < 86400s threshold. Exclude it and
add a dedicated step_ca_serving_cert_expiring alert at < 1h instead.

Co-Authored-By: Claude Opus 4.5
---
 services/monitoring/rules.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index c5eba34..b9b36aa 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -307,13 +307,21 @@ groups:
   - name: certificate_rules
     rules:
       - alert: certificate_expiring_soon
-        expr: labmon_tlsconmon_certificate_seconds_left < 86400
+        expr: labmon_tlsconmon_certificate_seconds_left{address!="ca.home.2rjus.net:443"} < 86400
         for: 5m
         labels:
           severity: warning
         annotations:
           summary: "TLS certificate expiring soon for {{ $labels.instance }}"
           description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
+      - alert: step_ca_serving_cert_expiring
+        expr: labmon_tlsconmon_certificate_seconds_left{address="ca.home.2rjus.net:443"} < 3600
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Step-CA serving certificate expiring"
+          description: "The step-ca serving certificate (24h auto-renewed) has less than 1 hour of validity left. Renewal may have failed."
       - alert: certificate_check_error
         expr: labmon_tlsconmon_certificate_check_error == 1
         for: 5m
-- 
2.49.1