Add some alerting rules for smartctl
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled

This commit is contained in:
Torjus Håkestad 2025-05-18 00:51:02 +02:00
parent afa3cc3a57
commit 3797526000
Signed by: torjus
SSH Key Fingerprint: SHA256:KjAds8wHfD2mBYK2H815s/+ABcSdcIHUndwHEdSxml4

View File

@ -192,6 +192,32 @@
annotations:
summary: "Mosquitto not running on {{ $labels.instance }}"
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
- name: smartctl_rules
rules:
- alert: SmartCriticalWarning
expr: smartctl_device_critical_warning > 0
for: 0m
labels:
severity: critical
annotations:
summary: SMART critical warning (instance {{ $labels.instance }})
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: smartctl_device_media_errors > 0
for: 0m
labels:
severity: critical
annotations:
summary: SMART media errors (instance {{ $labels.instance }})
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartWearoutIndicator
expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
for: 0m
labels:
severity: critical
annotations:
summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
''
];