Add monitoring rules for monitoring services
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled

This commit is contained in:
Torjus Håkestad 2025-05-29 10:09:23 +02:00
parent 1f6689aeb6
commit 797f915939
Signed by: torjus
SSH Key Fingerprint: SHA256:KjAds8wHfD2mBYK2H815s/+ABcSdcIHUndwHEdSxml4

View File

@ -105,6 +105,14 @@ groups:
annotations:
summary: "The build-flakes service on {{ $labels.instance }} has not run recently"
description: "The build-flakes service on {{ $labels.instance }} has not run recently"
- alert: build-flakes_timer_not_active
expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"} == 0
for: 0m
labels:
severity: critical
annotations:
summary: "The build-flakes timer on {{ $labels.instance }} is not active"
description: "The build-flakes timer on {{ $labels.instance }} is not active"
- alert: build_flakes_error
expr: build_flakes_error == 1
labels:
@ -190,3 +198,61 @@ groups:
annotations:
summary: "Wireguard handshake timeout on {{ $labels.instance }}"
description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
- name: monitoring_rules
rules:
- alert: prometheus_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
labels:
severity: critical
annotations:
summary: "Prometheus service not running on {{ $labels.instance }}"
description: "Prometheus service not running on {{ $labels.instance }}"
- alert: alertmanager_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
labels:
severity: critical
annotations:
summary: "Alertmanager service not running on {{ $labels.instance }}"
description: "Alertmanager service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
labels:
severity: critical
annotations:
summary: "Pushgateway service not running on {{ $labels.instance }}"
description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
labels:
severity: critical
annotations:
summary: "Pushgateway service not running on {{ $labels.instance }}"
description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: loki_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
labels:
severity: critical
annotations:
summary: "Loki service not running on {{ $labels.instance }}"
description: "Loki service not running on {{ $labels.instance }}"
- alert: grafana_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
labels:
severity: warning
annotations:
summary: "Grafana service not running on {{ $labels.instance }}"
description: "Grafana service not running on {{ $labels.instance }}"
- alert: tempo_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
labels:
severity: warning
annotations:
summary: "Tempo service not running on {{ $labels.instance }}"
description: "Tempo service not running on {{ $labels.instance }}"
- alert: pyroscope_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
labels:
severity: warning
annotations:
summary: "Pyroscope service not running on {{ $labels.instance }}"
description: "Pyroscope service not running on {{ $labels.instance }}"