From 8e1753c2c8478cdea89aa015cfc728409e21b19a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Mon, 9 Feb 2026 22:26:05 +0100 Subject: [PATCH] monitoring: fix blackbox rules and add force-push policy Move certificate alert rules to rules.yml instead of adding them as a separate rules string in blackbox.nix. The previous approach caused a YAML parse error due to duplicate 'groups' keys. Also add policy to CLAUDE.md: never force push to master. Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 2 ++ services/monitoring/rules.yml | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index af57dec..2bbff94 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -132,6 +132,8 @@ Terraform manages the secrets and AppRole policies in `terraform/vault/`. **Important:** Never amend commits to `master` unless the user explicitly asks for it. Amending rewrites history and causes issues for deployed configurations. +**Important:** Never force push to `master`. If a commit on master has an error, fix it with a new commit rather than rewriting history. + **Important:** Do not use `gh pr create` to create pull requests. The git server does not support GitHub CLI for PR creation. Instead, push the branch and let the user create the PR manually via the web interface. When starting a new plan or task, the first step should typically be to create and checkout a new branch with an appropriate name (e.g., `git checkout -b dns-automation` or `git checkout -b fix-nginx-config`). diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index 31d99c1..6822955 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -392,3 +392,29 @@ groups: annotations: summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}" description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}." 
+  - name: certificate_rules  # TLS certificate expiry and TLS probe failure alerts
+    rules:
+      - alert: tls_certificate_expiring_soon  # NOTE(review): expr is also true inside the 24h critical window, so both expiry alerts can fire together
+        expr: (probe_ssl_earliest_cert_expiry - time()) < 86400 * 7  # 86400 s = 1 day, so the threshold is 7 days
+        for: 1h  # condition must hold for 1h before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: "TLS certificate expiring soon on {{ $labels.instance }}"
+          description: "The TLS certificate for {{ $labels.instance }} expires in less than 7 days."
+      - alert: tls_certificate_expiring_critical
+        expr: (probe_ssl_earliest_cert_expiry - time()) < 86400  # threshold is 1 day (86400 s)
+        for: 0m  # no hold period
+        labels:
+          severity: critical
+        annotations:
+          summary: "TLS certificate expiring within 24h on {{ $labels.instance }}"
+          description: "The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required."
+      - alert: tls_probe_failed
+        expr: probe_success{job="blackbox_tls"} == 0  # only series from the blackbox_tls job
+        for: 5m  # probe must keep failing for 5m before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: "TLS probe failed for {{ $labels.instance }}"
+          description: "Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable."