monitoring01: remove host and migrate services to monitoring02
Some checks failed
Run nix flake check / flake-check (push) Failing after 3m15s
Run nix flake check / flake-check (pull_request) Failing after 3m8s

Remove monitoring01 host configuration and unused service modules
(prometheus, grafana, loki, tempo, pyroscope). Migrate blackbox,
exportarr, and pve exporters to monitoring02 with scrape configs
moved to VictoriaMetrics. Update alert rules, terraform vault
policies/secrets, http-proxy entries, and documentation to reflect
the monitoring02 migration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-17 21:50:20 +01:00
parent 1bba6f106a
commit 4f593126c0
29 changed files with 115 additions and 773 deletions

View File

@@ -259,32 +259,32 @@ groups:
description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
- name: monitoring_rules
rules:
- alert: prometheus_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
- alert: victoriametrics_not_running
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Prometheus service not running on {{ $labels.instance }}"
description: "Prometheus service not running on {{ $labels.instance }}"
summary: "VictoriaMetrics service not running on {{ $labels.instance }}"
description: "VictoriaMetrics service not running on {{ $labels.instance }}"
- alert: vmalert_not_running
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "vmalert service not running on {{ $labels.instance }}"
description: "vmalert service not running on {{ $labels.instance }}"
- alert: alertmanager_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Alertmanager service not running on {{ $labels.instance }}"
description: "Alertmanager service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Pushgateway service not running on {{ $labels.instance }}"
description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: loki_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0
for: 5m
labels:
severity: critical
@@ -292,29 +292,13 @@ groups:
summary: "Loki service not running on {{ $labels.instance }}"
description: "Loki service not running on {{ $labels.instance }}"
- alert: grafana_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Grafana service not running on {{ $labels.instance }}"
description: "Grafana service not running on {{ $labels.instance }}"
- alert: tempo_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Tempo service not running on {{ $labels.instance }}"
description: "Tempo service not running on {{ $labels.instance }}"
- alert: pyroscope_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Pyroscope service not running on {{ $labels.instance }}"
description: "Pyroscope service not running on {{ $labels.instance }}"
- name: proxmox_rules
rules:
- alert: pve_node_down