Add monitoring rules for monitoring services
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
This commit is contained in:
parent
1f6689aeb6
commit
797f915939
@ -105,6 +105,14 @@ groups:
|
||||
annotations:
|
||||
summary: "The build-flakes service on {{ $labels.instance }} has not run recently"
|
||||
description: "The build-flakes service on {{ $labels.instance }} has not run recently"
|
||||
- alert: build-flakes_timer_not_active
  # Watches the systemd *timer* unit, matching the alert name and annotations.
  # The selector previously pointed at build-flakes.service, which tracks the
  # service unit's state rather than the timer's.
  expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.timer", state="active"} == 0
  # Fire on the first evaluation where the timer is reported as not active.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "The build-flakes timer on {{ $labels.instance }} is not active"
    description: "The build-flakes timer on {{ $labels.instance }} is not active"
|
||||
- alert: build_flakes_error
|
||||
expr: build_flakes_error == 1
|
||||
labels:
|
||||
@ -190,3 +198,61 @@ groups:
|
||||
annotations:
|
||||
summary: "Wireguard handshake timeout on {{ $labels.instance }}"
|
||||
description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minute."
|
||||
- name: monitoring_rules
|
||||
rules:
|
||||
- alert: prometheus_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Prometheus service not running on {{ $labels.instance }}"
|
||||
description: "Prometheus service not running on {{ $labels.instance }}"
|
||||
- alert: alertmanager_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Alertmanager service not running on {{ $labels.instance }}"
|
||||
description: "Alertmanager service not running on {{ $labels.instance }}"
|
||||
# Single definition of the Pushgateway alert. This rule was declared twice with
# identical expr, labels and annotations; the redundant copy is removed.
- alert: pushgateway_not_running
  # node_exporter's systemd collector reports 0 for state="active" when the
  # unit is in any other state.
  expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
  labels:
    severity: critical
  annotations:
    summary: "Pushgateway service not running on {{ $labels.instance }}"
    description: "Pushgateway service not running on {{ $labels.instance }}"
|
||||
- alert: loki_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Loki service not running on {{ $labels.instance }}"
|
||||
description: "Loki service not running on {{ $labels.instance }}"
|
||||
- alert: grafana_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Grafana service not running on {{ $labels.instance }}"
|
||||
description: "Grafana service not running on {{ $labels.instance }}"
|
||||
- alert: tempo_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Tempo service not running on {{ $labels.instance }}"
|
||||
description: "Tempo service not running on {{ $labels.instance }}"
|
||||
- alert: pyroscope_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Pyroscope service not running on {{ $labels.instance }}"
|
||||
description: "Pyroscope service not running on {{ $labels.instance }}"
|
||||
|
Loading…
x
Reference in New Issue
Block a user