monitoring: auto-generate Prometheus scrape targets from host configs
Add homelab.monitoring NixOS options (enable, scrapeTargets) following the same pattern as homelab.dns. Prometheus scrape configs are now auto-generated from flake host configurations and external targets, replacing hardcoded target lists.

Also cleans up alert rules: snake_case naming, fix zigbee2mqtt typo, remove duplicate pushgateway alert, add for clauses to monitoring_rules, remove hardcoded WireGuard public key, and add new alerts for certificates, proxmox, caddy, smartctl temperature, filesystem prediction, systemd state, file descriptors, and host reboots. Fixes grafana scrape target port from 3100 to 3000.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
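In short: each host declares what it exposes, and the Prometheus host derives its scrape configs from the flake. A minimal sketch of the two sides (the jellyfin job and the home.2rjus.net naming scheme are taken from the diff below):

# On a flake host: declare an extra exporter next to the service it belongs to.
homelab.monitoring.scrapeTargets = [{
  job_name = "jellyfin";
  port = 8096;
}];

# On the Prometheus host, the generator turns that into roughly:
#   { job_name = "jellyfin"; static_configs = [{ targets = [ "jelly01.home.2rjus.net:8096" ]; }]; }
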
@@ -26,7 +26,11 @@
       };
     };
   };
-  # monitoring
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "wireguard";
+    port = 9586;
+  }];
+
   services.prometheus.exporters.wireguard = {
     enable = true;
   };

lib/monitoring.nix (new file, 145 lines)
@@ -0,0 +1,145 @@
+{ lib }:
+let
+  # Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
+  extractIP = address:
+    let
+      parts = lib.splitString "/" address;
+    in
+    builtins.head parts;
+
+  # Check if a network interface name looks like a VPN/tunnel interface
+  isVpnInterface = ifaceName:
+    lib.hasPrefix "wg" ifaceName ||
+    lib.hasPrefix "tun" ifaceName ||
+    lib.hasPrefix "tap" ifaceName ||
+    lib.hasPrefix "vti" ifaceName;
+
+  # Extract monitoring info from a single host configuration
+  # Returns null if host should not be included
+  extractHostMonitoring = name: hostConfig:
+    let
+      cfg = hostConfig.config;
+      monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
+      dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
+      hostname = cfg.networking.hostName;
+      networks = cfg.systemd.network.networks or { };
+
+      # Filter out VPN interfaces and find networks with static addresses
+      physicalNetworks = lib.filterAttrs
+        (netName: netCfg:
+          let
+            ifaceName = netCfg.matchConfig.Name or "";
+          in
+          !(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
+        networks;
+
+      # Get addresses from physical networks only
+      networkAddresses = lib.flatten (
+        lib.mapAttrsToList
+          (netName: netCfg: netCfg.address or [ ])
+          physicalNetworks
+      );
+
+      firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
+    in
+    if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
+      null
+    else
+      {
+        inherit hostname;
+        ip = extractIP firstAddress;
+        scrapeTargets = monConfig.scrapeTargets or [ ];
+      };
+
+  # Generate node-exporter targets from all flake hosts
+  generateNodeExporterTargets = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+      flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList;
+    in
+    flakeTargets ++ (externalTargets.nodeExporter or [ ]);
+
+  # Generate scrape configs from all flake hosts and external targets
+  generateScrapeConfigs = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+
+      # Collect all scrapeTargets from all hosts, grouped by job_name
+      allTargets = lib.flatten (map
+        (host:
+          map
+            (target: {
+              inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
+              hostname = host.hostname;
+            })
+            host.scrapeTargets
+        )
+        hostList
+      );
+
+      # Group targets by job_name
+      grouped = lib.groupBy (t: t.job_name) allTargets;
+
+      # Generate a scrape config for each job
+      flakeScrapeConfigs = lib.mapAttrsToList
+        (jobName: targets:
+          let
+            first = builtins.head targets;
+            targetAddrs = map
+              (t:
+                let
+                  portStr = toString t.port;
+                in
+                "${t.hostname}.home.2rjus.net:${portStr}")
+              targets;
+            config = {
+              job_name = jobName;
+              static_configs = [{
+                targets = targetAddrs;
+              }];
+            }
+            // (lib.optionalAttrs (first.metrics_path != "/metrics") {
+              metrics_path = first.metrics_path;
+            })
+            // (lib.optionalAttrs (first.scheme != "http") {
+              scheme = first.scheme;
+            })
+            // (lib.optionalAttrs (first.scrape_interval != null) {
+              scrape_interval = first.scrape_interval;
+            })
+            // (lib.optionalAttrs first.honor_labels {
+              honor_labels = true;
+            });
+          in
+          config
+        )
+        grouped;
+
+      # External scrape configs
+      externalScrapeConfigs = map
+        (ext: {
+          job_name = ext.job_name;
+          static_configs = [{
+            targets = ext.targets;
+          }];
+        } // (lib.optionalAttrs (ext ? metrics_path) {
+          metrics_path = ext.metrics_path;
+        }) // (lib.optionalAttrs (ext ? scheme) {
+          scheme = ext.scheme;
+        }) // (lib.optionalAttrs (ext ? scrape_interval) {
+          scrape_interval = ext.scrape_interval;
+        }))
+        (externalTargets.scrapeConfigs or [ ]);
+    in
+    flakeScrapeConfigs ++ externalScrapeConfigs;
+
+in
+{
+  inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
+}
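
Worth spelling out from the grouping logic above: targets that share a job_name across hosts end up in a single scrape config, and the per-job settings (metrics_path, scheme, scrape_interval, honor_labels) are taken from the first target in the group, so mixed settings under one job_name would be silently flattened. For the home-assistant target declared later in this diff, the generated entry comes out roughly as:

{
  job_name = "home-assistant";
  metrics_path = "/api/prometheus";
  scrape_interval = "60s";
  static_configs = [{ targets = [ "ha1.home.2rjus.net:8123" ]; }];
}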

@@ -2,5 +2,6 @@
 {
   imports = [
     ./dns.nix
+    ./monitoring.nix
   ];
 }

modules/homelab/monitoring.nix (new file, 50 lines)
@@ -0,0 +1,50 @@
+{ config, lib, ... }:
+let
+  cfg = config.homelab.monitoring;
+in
+{
+  options.homelab.monitoring = {
+    enable = lib.mkOption {
+      type = lib.types.bool;
+      default = true;
+      description = "Include this host in Prometheus node-exporter scrape targets";
+    };
+
+    scrapeTargets = lib.mkOption {
+      type = lib.types.listOf (lib.types.submodule {
+        options = {
+          job_name = lib.mkOption {
+            type = lib.types.str;
+            description = "Prometheus scrape job name";
+          };
+          port = lib.mkOption {
+            type = lib.types.port;
+            description = "Port to scrape metrics from";
+          };
+          metrics_path = lib.mkOption {
+            type = lib.types.str;
+            default = "/metrics";
+            description = "HTTP path to scrape metrics from";
+          };
+          scheme = lib.mkOption {
+            type = lib.types.str;
+            default = "http";
+            description = "HTTP scheme (http or https)";
+          };
+          scrape_interval = lib.mkOption {
+            type = lib.types.nullOr lib.types.str;
+            default = null;
+            description = "Override the global scrape interval for this target";
+          };
+          honor_labels = lib.mkOption {
+            type = lib.types.bool;
+            default = false;
+            description = "Whether to honor labels from the scraped target";
+          };
+        };
+      });
+      default = [ ];
+      description = "Additional Prometheus scrape targets exposed by this host";
+    };
+  };
+}
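
Both options follow the usual NixOS module pattern. A sketch of the two ways a host would use them (the exporter values here are hypothetical, not from this diff):

# Keep this host out of the generated node-exporter target list entirely:
homelab.monitoring.enable = false;

# Or advertise an exporter with non-default settings:
homelab.monitoring.scrapeTargets = [{
  job_name = "example-exporter";  # hypothetical job
  port = 8443;
  scheme = "https";
  metrics_path = "/custom/metrics";
}];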

@@ -1,5 +1,9 @@
 { pkgs, unstable, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "step-ca";
+    port = 9000;
+  }];
   sops.secrets."ca_root_pw" = {
     sopsFile = ../../secrets/ca/secrets.yaml;
     owner = "step-ca";

@@ -1,5 +1,11 @@
 { pkgs, config, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "home-assistant";
+    port = 8123;
+    metrics_path = "/api/prometheus";
+    scrape_interval = "60s";
+  }];
   # Enable the Home Assistant service
   services.home-assistant = {
     enable = true;

@@ -3,4 +3,9 @@
   imports = [
     ./proxy.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "caddy";
+    port = 80;
+  }];
 }

@@ -1,5 +1,9 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "jellyfin";
+    port = 8096;
+  }];
   services.jellyfin = {
     enable = true;
   };

services/monitoring/external-targets.nix (new file, 12 lines)
@@ -0,0 +1,12 @@
+# Monitoring targets for hosts not managed by this flake
+# These are manually maintained and combined with auto-generated targets
+{
+  nodeExporter = [
+    "gunter.home.2rjus.net:9100"
+  ];
+  scrapeConfigs = [
+    { job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
+    { job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
+    { job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
+  ];
+}
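
The attribute names here (nodeExporter, scrapeConfigs) are exactly what lib/monitoring.nix reads, and entries in scrapeConfigs may also carry the optional metrics_path, scheme, and scrape_interval keys the generator probes with `ext ?`. A hypothetical entry using those overrides:

{ job_name = "example"; targets = [ "example.home.2rjus.net:9443" ]; scheme = "https"; scrape_interval = "60s"; }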

@@ -1,4 +1,11 @@
-{ ... }:
+{ self, lib, ... }:
+let
+  monLib = import ../../lib/monitoring.nix { inherit lib; };
+  externalTargets = import ./external-targets.nix;
+
+  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
+  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
+in
 {
   services.prometheus = {
     enable = true;
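
For `self` to show up as a module argument here, the flake has to pass it through; a common wiring looks roughly like this (a sketch under that assumption — this diff does not show the flake.nix side, and the module path is hypothetical):

# flake.nix (sketch)
nixosConfigurations.monitoring01 = nixpkgs.lib.nixosSystem {
  specialArgs = { inherit self; };
  modules = [ ./hosts/monitoring01 ];
};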
@@ -45,26 +52,16 @@
     ];

     scrapeConfigs = [
+      # Auto-generated node-exporter targets from flake hosts + external
       {
         job_name = "node-exporter";
         static_configs = [
           {
-            targets = [
-              "ca.home.2rjus.net:9100"
-              "gunter.home.2rjus.net:9100"
-              "ha1.home.2rjus.net:9100"
-              "http-proxy.home.2rjus.net:9100"
-              "jelly01.home.2rjus.net:9100"
-              "monitoring01.home.2rjus.net:9100"
-              "nix-cache01.home.2rjus.net:9100"
-              "ns1.home.2rjus.net:9100"
-              "ns2.home.2rjus.net:9100"
-              "pgdb1.home.2rjus.net:9100"
-              "nats1.home.2rjus.net:9100"
-            ];
+            targets = nodeExporterTargets;
           }
         ];
       }
+      # Local monitoring services (not auto-generated)
       {
         job_name = "prometheus";
         static_configs = [
@@ -85,7 +82,7 @@
         job_name = "grafana";
         static_configs = [
           {
-            targets = [ "localhost:3100" ];
+            targets = [ "localhost:3000" ];
           }
         ];
       }
@@ -98,13 +95,23 @@
         ];
       }
       {
-        job_name = "restic_rest";
+        job_name = "pushgateway";
+        honor_labels = true;
         static_configs = [
           {
-            targets = [ "10.69.12.52:8000" ];
+            targets = [ "localhost:9091" ];
           }
         ];
       }
+      {
+        job_name = "labmon";
+        static_configs = [
+          {
+            targets = [ "monitoring01.home.2rjus.net:9969" ];
+          }
+        ];
+      }
+      # pve-exporter with complex relabel config
       {
         job_name = "pve-exporter";
         static_configs = [
@@ -133,91 +140,8 @@
         }
       ];
     }
-      {
-        job_name = "caddy";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net" ];
-          }
-        ];
-      }
-      {
-        job_name = "jellyfin";
-        static_configs = [
-          {
-            targets = [ "jelly01.home.2rjus.net:8096" ];
-          }
-        ];
-      }
-      {
-        job_name = "smartctl";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:9633" ];
-          }
-        ];
-      }
-      {
-        job_name = "wireguard";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net:9586" ];
-          }
-        ];
-      }
-      {
-        job_name = "home-assistant";
-        scrape_interval = "60s";
-        metrics_path = "/api/prometheus";
-        static_configs = [
-          {
-            targets = [ "ha1.home.2rjus.net:8123" ];
-          }
-        ];
-      }
-      {
-        job_name = "ghettoptt";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:8989" ];
-          }
-        ];
-      }
-      {
-        job_name = "step-ca";
-        static_configs = [
-          {
-            targets = [ "ca.home.2rjus.net:9000" ];
-          }
-        ];
-      }
-      {
-        job_name = "labmon";
-        static_configs = [
-          {
-            targets = [ "monitoring01.home.2rjus.net:9969" ];
-          }
-        ];
-      }
-      {
-        job_name = "pushgateway";
-        honor_labels = true;
-        static_configs = [
-          {
-            targets = [ "localhost:9091" ];
-          }
-        ];
-      }
-      {
-        job_name = "nix-cache_caddy";
-        scheme = "https";
-        static_configs = [
-          {
-            targets = [ "nix-cache.home.2rjus.net" ];
-          }
-        ];
-      }
-    ];
+    ] ++ autoScrapeConfigs;

     pushgateway = {
       enable = true;
       web = {

@@ -57,6 +57,38 @@ groups:
     annotations:
       summary: "Promtail service not running on {{ $labels.instance }}"
       description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
+  - alert: filesystem_filling_up
+    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
+      description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
+  - alert: systemd_not_running
+    expr: node_systemd_system_running == 0
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Systemd not in running state on {{ $labels.instance }}"
+      description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
+  - alert: high_file_descriptors
+    expr: node_filefd_allocated / node_filefd_maximum > 0.8
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High file descriptor usage on {{ $labels.instance }}"
+      description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
+  - alert: host_reboot
+    expr: changes(node_boot_time_seconds[10m]) > 0
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: "Host {{ $labels.instance }} has rebooted"
+      description: "Host {{ $labels.instance }} has rebooted."
 - name: nameserver_rules
   rules:
   - alert: unbound_down
@@ -75,7 +107,7 @@
     annotations:
       summary: "NSD not running on {{ $labels.instance }}"
       description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
-- name: http-proxy_rules
+- name: http_proxy_rules
   rules:
   - alert: caddy_down
     expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@
     annotations:
      summary: "Caddy not running on {{ $labels.instance }}"
      description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
+  - alert: caddy_upstream_unhealthy
+    expr: caddy_reverse_proxy_upstreams_healthy == 0
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
+      description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
+  - alert: caddy_high_error_rate
+    expr: rate(caddy_http_request_errors_total[5m]) > 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High HTTP error rate on {{ $labels.instance }}"
+      description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
 - name: nats_rules
   rules:
   - alert: nats_down
@@ -97,7 +145,7 @@
       description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
 - name: nix_cache_rules
   rules:
-  - alert: build-flakes_service_not_active_recently
+  - alert: build_flakes_service_not_active_recently
     expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
     for: 0m
     labels:
@@ -138,7 +186,7 @@
     annotations:
       summary: "Home assistant not running on {{ $labels.instance }}"
       description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
-  - alert: zigbee2qmtt_down
+  - alert: zigbee2mqtt_down
     expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
     for: 5m
     labels:
@@ -156,7 +204,7 @@
       description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
 - name: smartctl_rules
   rules:
-  - alert: SmartCriticalWarning
+  - alert: smart_critical_warning
     expr: smartctl_device_critical_warning > 0
     for: 0m
     labels:
@@ -164,7 +212,7 @@
     annotations:
       summary: SMART critical warning (instance {{ $labels.instance }})
       description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - alert: SmartMediaErrors
+  - alert: smart_media_errors
     expr: smartctl_device_media_errors > 0
     for: 0m
     labels:
@@ -172,7 +220,7 @@
     annotations:
       summary: SMART media errors (instance {{ $labels.instance }})
       description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - alert: SmartWearoutIndicator
+  - alert: smart_wearout_indicator
     expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
     for: 0m
     labels:
@@ -180,20 +228,29 @@
     annotations:
       summary: SMART Wearout Indicator (instance {{ $labels.instance }})
       description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+  - alert: smartctl_high_temperature
+    expr: smartctl_device_temperature > 60
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Disk temperature above 60C on {{ $labels.instance }}"
+      description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
 - name: wireguard_rules
   rules:
-  - alert: WireguardHandshake
-    expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
+  - alert: wireguard_handshake_timeout
+    expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
     for: 1m
     labels:
       severity: warning
     annotations:
       summary: "Wireguard handshake timeout on {{ $labels.instance }}"
-      description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
+      description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
 - name: monitoring_rules
   rules:
   - alert: prometheus_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -201,6 +258,7 @@
       description: "Prometheus service not running on {{ $labels.instance }}"
   - alert: alertmanager_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -208,13 +266,7 @@
       description: "Alertmanager service not running on {{ $labels.instance }}"
   - alert: pushgateway_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
-    labels:
-      severity: critical
-    annotations:
-      summary: "Pushgateway service not running on {{ $labels.instance }}"
-      description: "Pushgateway service not running on {{ $labels.instance }}"
-  - alert: pushgateway_not_running
-    expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -222,6 +274,7 @@
       description: "Pushgateway service not running on {{ $labels.instance }}"
   - alert: loki_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
+    for: 5m
     labels:
      severity: critical
    annotations:
@@ -229,6 +282,7 @@
       description: "Loki service not running on {{ $labels.instance }}"
   - alert: grafana_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
+    for: 5m
     labels:
       severity: warning
     annotations:
@@ -236,6 +290,7 @@
       description: "Grafana service not running on {{ $labels.instance }}"
   - alert: tempo_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
+    for: 5m
     labels:
       severity: warning
     annotations:
@@ -243,8 +298,53 @@
       description: "Tempo service not running on {{ $labels.instance }}"
   - alert: pyroscope_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
+    for: 5m
     labels:
       severity: warning
     annotations:
       summary: "Pyroscope service not running on {{ $labels.instance }}"
       description: "Pyroscope service not running on {{ $labels.instance }}"
+- name: certificate_rules
+  rules:
+  - alert: certificate_expiring_soon
+    expr: labmon_tlsconmon_certificate_seconds_left < 86400
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "TLS certificate expiring soon for {{ $labels.instance }}"
+      description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
+  - alert: certificate_check_error
+    expr: labmon_tlsconmon_certificate_check_error == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Error checking certificate for {{ $labels.address }}"
+      description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
+  - alert: step_ca_certificate_expiring
+    expr: labmon_stepmon_certificate_seconds_left < 3600
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Step-CA certificate expiring for {{ $labels.instance }}"
+      description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
+- name: proxmox_rules
+  rules:
+  - alert: pve_node_down
+    expr: pve_up{id=~"node/.*"} == 0
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Proxmox node {{ $labels.id }} is down"
+      description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
+  - alert: pve_guest_stopped
+    expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Proxmox VM {{ $labels.id }} is stopped"
+      description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."

@@ -6,4 +6,10 @@
     ./proxy.nix
     ./nix.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "nix-cache_caddy";
+    port = 443;
+    scheme = "https";
+  }];
 }