diff --git a/services/monitoring/prometheus.nix b/services/monitoring/prometheus.nix
index d0d2117..c5f91d7 100644
--- a/services/monitoring/prometheus.nix
+++ b/services/monitoring/prometheus.nix
@@ -192,6 +192,32 @@
               annotations:
                 summary: "Mosquitto not running on {{ $labels.instance }}"
                 description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
+        - name: smartctl_rules  # disk-health alerts evaluated on smartctl_device_* metrics
+          rules:
+            - alert: SmartCriticalWarning
+              expr: smartctl_device_critical_warning > 0
+              for: 0m  # no pending period: fires on the first evaluation where expr holds
+              labels:
+                severity: critical
+              annotations:
+                summary: "SMART critical warning (instance {{ $labels.instance }})"
+                description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+            - alert: SmartMediaErrors
+              expr: smartctl_device_media_errors > 0
+              for: 0m  # no pending period: fires on the first evaluation where expr holds
+              labels:
+                severity: critical
+              annotations:
+                summary: "SMART media errors (instance {{ $labels.instance }})"
+                description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+            - alert: SmartWearoutIndicator
+              expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold  # spare below its threshold
+              for: 0m  # no pending period: fires on the first evaluation where expr holds
+              labels:
+                severity: critical
+              annotations:
+                summary: "SMART Wearout Indicator (instance {{ $labels.instance }})"
+                description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
       ''
     ];
 