# Patch for services/monitoring/rules.yml (single hunk, @@ -118,13 +118,13 @@).
# Changes the unbound_low_cache_hit_ratio alert rule:
#   - expr: the cache hit ratio, hits / (hits + misses), is computed from 5m
#     rates of the dns_role="primary" series; the firing threshold is lowered
#     from "< 0.5" to "< 0.2".
#   - annotations.description: wording updated from "below 50%" to "below 20%"
#     so the message matches the new threshold.
# The `for: 15m` duration, the `severity: warning` label and the summary
# annotation are unchanged context lines.
# NOTE(review): the whole patch sits on one physical line here; the line breaks
# and the indentation of the context lines appear to have been lost, so it
# presumably has to be regenerated from the original commit before it can be
# applied - confirm against the repository history.
diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml index cec0b55..a5f8abf 100644 --- a/services/monitoring/rules.yml +++ b/services/monitoring/rules.yml @@ -118,13 +118,13 @@ groups: description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." # Only alert on primary DNS (secondary has cold cache after failover) - alert: unbound_low_cache_hit_ratio - expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.5 + expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.2 for: 15m labels: severity: warning annotations: summary: "Low DNS cache hit ratio on {{ $labels.instance }}" - description: "Unbound cache hit ratio is below 50% on {{ $labels.instance }}." + description: "Unbound cache hit ratio is below 20% on {{ $labels.instance }}." - name: http_proxy_rules rules: - alert: caddy_down