diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index c5eba34..b9b36aa 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -307,13 +307,21 @@ groups:
   - name: certificate_rules
     rules:
       - alert: certificate_expiring_soon
-        expr: labmon_tlsconmon_certificate_seconds_left < 86400
+        expr: labmon_tlsconmon_certificate_seconds_left{address!="ca.home.2rjus.net:443"} < 86400
         for: 5m
         labels:
           severity: warning
         annotations:
           summary: "TLS certificate expiring soon for {{ $labels.instance }}"
           description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
+      - alert: step_ca_serving_cert_expiring
+        expr: labmon_tlsconmon_certificate_seconds_left{address="ca.home.2rjus.net:443"} < 3600
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Step-CA serving certificate expiring"
+          description: "The step-ca serving certificate (24h auto-renewed) has less than 1 hour of validity left. Renewal may have failed."
       - alert: certificate_check_error
         expr: labmon_tlsconmon_certificate_check_error == 1
         for: 5m