From 797f91593913243875085b05ac889fcbcae24878 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torjus=20H=C3=A5kestad?=
Date: Thu, 29 May 2025 10:09:23 +0200
Subject: [PATCH] Add monitoring rules for monitoring services

---
Review notes (text between the "---" line and the diffstat is ignored by git am):
 * build-flakes_timer_not_active now selects name="build-flakes.timer". It
   previously selected "build-flakes.service" even though the alert name and
   both annotations describe the timer.
 * The second, identical pushgateway_not_running rule was dropped.
 * Hunk line counts and the diffstat were recomputed for the changes above;
   the post-image blob id on the "index" line was NOT recomputed.
 * NOTE(review): YAML indentation was reconstructed (2 spaces, indented
   sequences) because the reviewed copy had its whitespace collapsed. Confirm
   the context lines against services/monitoring/rules.yml before applying.

 services/monitoring/rules.yml | 62 +++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/services/monitoring/rules.yml b/services/monitoring/rules.yml
index e8fd54b..ec99a80 100644
--- a/services/monitoring/rules.yml
+++ b/services/monitoring/rules.yml
@@ -105,6 +105,15 @@ groups:
         annotations:
           summary: "The build-flakes service on {{ $labels.instance }} has not run recently"
           description: "The build-flakes service on {{ $labels.instance }} has not run recently"
+      # Fires immediately (for: 0m) when the build-flakes timer unit is not in the "active" state.
+      - alert: build-flakes_timer_not_active
+        expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.timer", state="active"} == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: "The build-flakes timer on {{ $labels.instance }} is not active"
+          description: "The build-flakes timer on {{ $labels.instance }} is not active"
       - alert: build_flakes_error
         expr: build_flakes_error == 1
         labels:
@@ -190,3 +199,56 @@ groups:
         annotations:
           summary: "Wireguard handshake timeout on {{ $labels.instance }}"
           description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
+  # One rule per systemd unit on monitoring01; each fires when the unit's "active" state series is 0.
+  # prometheus, alertmanager, pushgateway and loki are critical; grafana, tempo and pyroscope are warnings.
+  - name: monitoring_rules
+    rules:
+      - alert: prometheus_not_running
+        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus service not running on {{ $labels.instance }}"
+          description: "Prometheus service not running on {{ $labels.instance }}"
+      - alert: alertmanager_not_running
+        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "Alertmanager service not running on {{ $labels.instance }}"
+          description: "Alertmanager service not running on {{ $labels.instance }}"
+      - alert: pushgateway_not_running
+        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "Pushgateway service not running on {{ $labels.instance }}"
+          description: "Pushgateway service not running on {{ $labels.instance }}"
+      - alert: loki_not_running
+        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
+        labels:
+          severity: critical
+        annotations:
+          summary: "Loki service not running on {{ $labels.instance }}"
+          description: "Loki service not running on {{ $labels.instance }}"
+      - alert: grafana_not_running
+        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
+        labels:
+          severity: warning
+        annotations:
+          summary: "Grafana service not running on {{ $labels.instance }}"
+          description: "Grafana service not running on {{ $labels.instance }}"
+      - alert: tempo_not_running
+        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
+        labels:
+          severity: warning
+        annotations:
+          summary: "Tempo service not running on {{ $labels.instance }}"
+          description: "Tempo service not running on {{ $labels.instance }}"
+      - alert: pyroscope_not_running
+        expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
+        labels:
+          severity: warning
+        annotations:
+          summary: "Pyroscope service not running on {{ $labels.instance }}"
+          description: "Pyroscope service not running on {{ $labels.instance }}"