Some checks failed
Run nix flake check / flake-check (push) Failing after 14m34s
271 lines
9.3 KiB
Nix
271 lines
9.3 KiB
Nix
# NixOS module: Prometheus server, its Alertmanager, alerting rules and
# scrape targets for the monitoring host.
{ ... }:
{
  services.prometheus = {
    enable = true;

    # Alertmanager instance; every alert is routed to a single webhook
    # receiver listening on localhost:5001.
    alertmanager = {
      enable = true;
      configuration = {
        global = { };
        route = {
          receiver = "webhook_natstonotify";
          group_wait = "30s";
          group_interval = "5m";
          repeat_interval = "12h";
          group_by = [ "alertname" ];
        };
        receivers = [
          {
            name = "webhook_natstonotify";
            webhook_configs = [
              {
                url = "http://localhost:5001/alert";
              }
            ];
          }
        ];
      };
    };

    # Where Prometheus sends firing alerts (same endpoint as the
    # "alertmanager" scrape job below).
    alertmanagers = [
      {
        static_configs = [
          {
            targets = [ "localhost:9093" ];
          }
        ];
      }
    ];

    globalConfig = {
      scrape_interval = "15s";
    };

    # Alerting rules, written as one YAML document. The `{{ $labels.* }}`
    # templates are passed through untouched: Nix only interpolates `${...}`.
    #
    # Alert names follow snake_case `<thing>_<state>`. Three names were
    # normalised to match: "nats down" -> nats_down,
    # "build-flakes service failed" -> build_flakes_service_failed, and the
    # typo zigbee2qmtt_down -> zigbee2mqtt_down.
    # NOTE(review): silences or matchers keyed on the old names need updating.
    rules = [
      ''
        groups:
          - name: common_rules
            rules:
              - alert: node_down
                expr: up == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Instance {{ $labels.instance }} down"
                  description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
              - alert: low_disk_space
                expr: node_filesystem_free_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 10
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Disk space low on {{ $labels.instance }}"
                  description: "Disk space is low on {{ $labels.instance }}. Please check."
              - alert: high_cpu_load
                expr: max(node_load5{}) by (instance) > (count by (instance)(node_cpu_seconds_total{mode="idle"}) * 0.7)
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "High CPU load on {{ $labels.instance }}"
                  description: "CPU load is high on {{ $labels.instance }}. Please check."
              - alert: nixos_upgrade_failed
                expr: node_systemd_unit_state{name="nixos-upgrade.service", state="failed"} == 1
                for: 1m
                labels:
                  severity: critical
                annotations:
                  summary: "NixOS upgrade failed on {{ $labels.instance }}"
                  description: "NixOS upgrade failed on {{ $labels.instance }}"
          - name: nameserver_rules
            rules:
              - alert: unbound_down
                expr: node_systemd_unit_state {instance =~ "ns.+", name = "unbound.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Unbound not running on {{ $labels.instance }}"
                  description: "Unbound has been down on {{ $labels.instance }} more than 5 minutes."
              - alert: nsd_down
                expr: node_systemd_unit_state {instance =~ "ns.+", name = "nsd.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "NSD not running on {{ $labels.instance }}"
                  description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
          - name: http-proxy_rules
            rules:
              - alert: caddy_down
                expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Caddy not running on {{ $labels.instance }}"
                  description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
          - name: nats_rules
            rules:
              - alert: nats_down
                expr: node_systemd_unit_state {instance="nats1.home.2rjus.net:9100", name = "nats.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "NATS not running on {{ $labels.instance }}"
                  description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
          - name: nix_cache_rules
            rules:
              - alert: build_flakes_service_failed
                expr: node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service",state="failed"} == 1
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "The build-flakes service on {{ $labels.instance }} has failed"
                  description: "The build-flakes service on {{ $labels.instance }} has failed"
              - alert: low_disk_space_nix
                expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: "Disk space low on /nix for {{ $labels.instance }}"
                  description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
          - name: home_assistant_rules
            rules:
              - alert: home_assistant_down
                expr: node_systemd_unit_state {instance="ha1.home.2rjus.net:9100", name="home-assistant.service", state="active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Home assistant not running on {{ $labels.instance }}"
                  description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
              - alert: zigbee2mqtt_down
                expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Zigbee2mqtt not running on {{ $labels.instance }}"
                  description: "Zigbee2mqtt has been down on {{ $labels.instance }} more than 5 minutes."
              - alert: mosquitto_down
                expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "mosquitto.service", state = "active"} == 0
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: "Mosquitto not running on {{ $labels.instance }}"
                  description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
      ''
    ];

    scrapeConfigs = [
      # node_exporter on every managed host (port 9100). The per-host alert
      # rules above match on these exact `instance` values.
      {
        job_name = "node-exporter";
        static_configs = [
          {
            targets = [
              "ca.home.2rjus.net:9100"
              "gunter.home.2rjus.net:9100"
              "ha1.home.2rjus.net:9100"
              "http-proxy.home.2rjus.net:9100"
              "jelly01.home.2rjus.net:9100"
              "monitoring01.home.2rjus.net:9100"
              "nix-cache01.home.2rjus.net:9100"
              "ns1.home.2rjus.net:9100"
              "ns2.home.2rjus.net:9100"
              "pgdb1.home.2rjus.net:9100"
              "nats1.home.2rjus.net:9100"
            ];
          }
        ];
      }
      # Prometheus scraping itself.
      {
        job_name = "prometheus";
        static_configs = [
          {
            targets = [ "localhost:9090" ];
          }
        ];
      }
      {
        job_name = "loki";
        static_configs = [
          {
            targets = [ "localhost:3100" ];
          }
        ];
      }
      # Previously pointed at localhost:3100, which is the Loki target above,
      # so Loki was scraped twice and Grafana not at all.
      # NOTE(review): 3000 is Grafana's default HTTP port; adjust if this
      # host overrides it in its Grafana settings.
      {
        job_name = "grafana";
        static_configs = [
          {
            targets = [ "localhost:3000" ];
          }
        ];
      }
      {
        job_name = "alertmanager";
        static_configs = [
          {
            targets = [ "localhost:9093" ];
          }
        ];
      }
      {
        job_name = "restic_rest";
        static_configs = [
          {
            targets = [ "10.69.12.52:8000" ];
          }
        ];
      }
      # Proxmox VE exporter. The listed target is not scraped directly: the
      # relabel rules copy it into the `target` URL parameter and the
      # `instance` label, then rewrite the scrape address to the exporter
      # at 127.0.0.1:9221.
      {
        job_name = "pve-exporter";
        static_configs = [
          {
            targets = [ "10.69.12.75" ];
          }
        ];
        metrics_path = "/pve";
        params = {
          module = [ "default" ];
          cluster = [ "1" ];
          node = [ "1" ];
        };
        relabel_configs = [
          {
            source_labels = [ "__address__" ];
            target_label = "__param_target";
          }
          {
            source_labels = [ "__param_target" ];
            target_label = "instance";
          }
          {
            target_label = "__address__";
            replacement = "127.0.0.1:9221";
          }
        ];
      }
      # No explicit port on this target.
      # NOTE(review): confirm Caddy serves its metrics on the scheme's
      # default port for this host.
      {
        job_name = "caddy";
        static_configs = [
          {
            targets = [ "http-proxy.home.2rjus.net" ];
          }
        ];
      }
      {
        job_name = "jellyfin";
        static_configs = [
          {
            targets = [ "jelly01.home.2rjus.net:8096" ];
          }
        ];
      }
    ];
  };
}