monitoring01: remove host and migrate services to monitoring02
Remove monitoring01 host configuration and unused service modules (prometheus, grafana, loki, tempo, pyroscope). Migrate blackbox, exportarr, and pve exporters to monitoring02 with scrape configs moved to VictoriaMetrics. Update alert rules, terraform vault policies/secrets, http-proxy entries, and documentation to reflect the monitoring02 migration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -259,32 +259,32 @@ groups:
|
||||
description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
|
||||
- name: monitoring_rules
|
||||
rules:
|
||||
- alert: prometheus_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
|
||||
- alert: victoriametrics_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Prometheus service not running on {{ $labels.instance }}"
|
||||
description: "Prometheus service not running on {{ $labels.instance }}"
|
||||
summary: "VictoriaMetrics service not running on {{ $labels.instance }}"
|
||||
description: "VictoriaMetrics service not running on {{ $labels.instance }}"
|
||||
- alert: vmalert_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "vmalert service not running on {{ $labels.instance }}"
|
||||
description: "vmalert service not running on {{ $labels.instance }}"
|
||||
- alert: alertmanager_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Alertmanager service not running on {{ $labels.instance }}"
|
||||
description: "Alertmanager service not running on {{ $labels.instance }}"
|
||||
- alert: pushgateway_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Pushgateway service not running on {{ $labels.instance }}"
|
||||
description: "Pushgateway service not running on {{ $labels.instance }}"
|
||||
- alert: loki_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -292,29 +292,13 @@ groups:
|
||||
summary: "Loki service not running on {{ $labels.instance }}"
|
||||
description: "Loki service not running on {{ $labels.instance }}"
|
||||
- alert: grafana_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
|
||||
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Grafana service not running on {{ $labels.instance }}"
|
||||
description: "Grafana service not running on {{ $labels.instance }}"
|
||||
- alert: tempo_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Tempo service not running on {{ $labels.instance }}"
|
||||
description: "Tempo service not running on {{ $labels.instance }}"
|
||||
- alert: pyroscope_not_running
|
||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Pyroscope service not running on {{ $labels.instance }}"
|
||||
description: "Pyroscope service not running on {{ $labels.instance }}"
|
||||
- name: proxmox_rules
|
||||
rules:
|
||||
- alert: pve_node_down
|
||||
|
||||
Reference in New Issue
Block a user