monitoring: implement monitoring gaps coverage
Some checks failed
Run nix flake check / flake-check (push) Failing after 7m36s
Some checks failed
Run nix flake check / flake-check (push) Failing after 7m36s
Add exporters and scrape targets for services lacking monitoring:
- PostgreSQL: postgres-exporter on pgdb1
- Authelia: native telemetry metrics on auth01
- Unbound: unbound-exporter with remote-control on ns1/ns2
- NATS: HTTP monitoring endpoint on nats1
- OpenBao: telemetry config and Prometheus scrape with token auth
- Systemd: systemd-exporter on all hosts for per-service metrics

Add alert rules for postgres, auth (authelia + lldap), jellyfin, and vault (openbao), and extend the existing nats and unbound rules.

Add Terraform config for the Prometheus metrics policy and token. The token is created via the vault_token resource and stored in KV, so no manual token creation is needed.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -115,6 +115,14 @@ groups:
|
||||
annotations:
|
||||
summary: "NSD not running on {{ $labels.instance }}"
|
||||
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
# Warn when, over a 5-minute rate window, cache hits make up less than
# half of all cache lookups (hits + misses), sustained for 15 minutes.
- alert: unbound_low_cache_hit_ratio
  expr: >-
    (rate(unbound_cache_hits_total[5m]) /
    (rate(unbound_cache_hits_total[5m]) +
    rate(unbound_cache_misses_total[5m]))) < 0.5
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Low DNS cache hit ratio on {{ $labels.instance }}'
    description: 'Unbound cache hit ratio is below 50% on {{ $labels.instance }}.'
|
||||
- name: http_proxy_rules
|
||||
rules:
|
||||
- alert: caddy_down
|
||||
@@ -151,6 +159,14 @@ groups:
|
||||
annotations:
|
||||
summary: "NATS not running on {{ $labels.instance }}"
|
||||
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
|
||||
# Warn when the slow-consumer count reported for NATS has been above
# zero for 5 minutes.
# NOTE(review): if nats_core_slow_consumer_count is a cumulative total
# rather than a current gauge, this alert will not clear until the
# server restarts - confirm against the exporter in use.
- alert: nats_slow_consumers
  expr: 'nats_core_slow_consumer_count > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'NATS has slow consumers on {{ $labels.instance }}'
    description: 'NATS has {{ $value }} slow consumers on {{ $labels.instance }}.'
|
||||
- name: nix_cache_rules
|
||||
rules:
|
||||
- alert: build_flakes_service_not_active_recently
|
||||
@@ -364,3 +380,83 @@ groups:
|
||||
annotations:
|
||||
summary: "Proxmox VM {{ $labels.id }} is stopped"
|
||||
description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
|
||||
# Alerts for the PostgreSQL server on pgdb1 and its metrics exporter.
- name: postgres_rules
  rules:
    # The postgresql.service systemd unit on pgdb1 has not been in the
    # "active" state for 5 minutes (as reported via the :9100 target).
    - alert: postgres_down
      expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "PostgreSQL not running on {{ $labels.instance }}"
        description: "PostgreSQL has been down on {{ $labels.instance }} more than 5 minutes."
    # The scrape of the "postgres" job itself has been failing for 5 minutes.
    - alert: postgres_exporter_down
      expr: up{job="postgres"} == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "PostgreSQL exporter down on {{ $labels.instance }}"
        description: "Cannot scrape PostgreSQL metrics from {{ $labels.instance }}."
    # More than 80% of max_connections are in use for 5 minutes.
    # Both sides are aggregated down to the `instance` label before the
    # division: pg_stat_activity_count is split per database/state while
    # pg_settings_max_connections is not, so a bare `a / b` finds no
    # series with identical label sets and would never produce a result.
    - alert: postgres_high_connections
      expr: >-
        sum by (instance) (pg_stat_activity_count)
        / max by (instance) (pg_settings_max_connections)
        > 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "PostgreSQL connection pool near exhaustion on {{ $labels.instance }}"
        description: "PostgreSQL is using over 80% of max_connections on {{ $labels.instance }}."
|
||||
# Alerts for the authentication services on auth01 (Authelia and LLDAP).
# Both rules watch the systemd unit state reported via the :9100 target.
- name: auth_rules
  rules:
    # authelia-auth.service has not been "active" for 5 minutes.
    - alert: authelia_down
      expr: >-
        node_systemd_unit_state{instance="auth01.home.2rjus.net:9100",
        name="authelia-auth.service", state="active"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'Authelia not running on {{ $labels.instance }}'
        description: 'Authelia has been down on {{ $labels.instance }} more than 5 minutes.'
    # lldap.service has not been "active" for 5 minutes.
    - alert: lldap_down
      expr: >-
        node_systemd_unit_state{instance="auth01.home.2rjus.net:9100",
        name="lldap.service", state="active"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'LLDAP not running on {{ $labels.instance }}'
        description: 'LLDAP has been down on {{ $labels.instance }} more than 5 minutes.'
|
||||
# Alerts for Jellyfin, based solely on whether its scrape target is up.
- name: jellyfin_rules
  rules:
    # The "jellyfin" scrape job has been failing for 5 minutes.
    - alert: jellyfin_down
      expr: 'up{job="jellyfin"} == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Jellyfin not responding on {{ $labels.instance }}'
        description: 'Cannot scrape Jellyfin metrics from {{ $labels.instance }} for 5 minutes.'
|
||||
# Alerts for OpenBao on vault01: service state, seal state and scrape health.
- name: vault_rules
  rules:
    # openbao.service has not been "active" for 5 minutes (systemd unit
    # state reported via the :9100 target).
    - alert: openbao_down
      expr: >-
        node_systemd_unit_state{instance="vault01.home.2rjus.net:9100",
        name="openbao.service", state="active"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'OpenBao not running on {{ $labels.instance }}'
        description: 'OpenBao has been down on {{ $labels.instance }} more than 5 minutes.'
    # vault_core_unsealed has reported 0 for 5 minutes.
    # NOTE(review): if the metrics endpoint cannot be scraped while the
    # server is sealed, this series would be absent rather than 0 and
    # only openbao_scrape_down would fire - confirm.
    - alert: openbao_sealed
      expr: 'vault_core_unsealed == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'OpenBao is sealed on {{ $labels.instance }}'
        description: 'OpenBao has been sealed on {{ $labels.instance }} for more than 5 minutes.'
    # The "openbao" scrape job has been failing for 5 minutes.
    - alert: openbao_scrape_down
      expr: 'up{job="openbao"} == 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Cannot scrape OpenBao metrics from {{ $labels.instance }}'
        description: 'OpenBao metrics endpoint is not responding on {{ $labels.instance }}.'
|
||||
|
||||
Reference in New Issue
Block a user