monitoring: auto-generate Prometheus scrape targets from host configs #16

Merged
torjus merged 2 commits from monitoring-improvements into master 2026-02-04 23:53:46 +00:00
13 changed files with 401 additions and 121 deletions

View File

@@ -122,9 +122,10 @@ This ensures documentation matches the exact nixpkgs version (currently NixOS 25
- Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix - Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix
- Monitoring: node-exporter and promtail on every host - Monitoring: node-exporter and promtail on every host
- `/modules/` - Custom NixOS modules - `/modules/` - Custom NixOS modules
- `homelab/` - Homelab-specific options (DNS automation, etc.) - `homelab/` - Homelab-specific options (DNS automation, monitoring scrape targets)
- `/lib/` - Nix library functions - `/lib/` - Nix library functions
- `dns-zone.nix` - DNS zone generation functions - `dns-zone.nix` - DNS zone generation functions
- `monitoring.nix` - Prometheus scrape target generation functions
- `/services/` - Reusable service modules, selectively imported by hosts - `/services/` - Reusable service modules, selectively imported by hosts
- `home-assistant/` - Home automation stack - `home-assistant/` - Home automation stack
- `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo) - `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
@@ -156,6 +157,7 @@ All hosts automatically get:
- Internal ACME CA integration (ca.home.2rjus.net) - Internal ACME CA integration (ca.home.2rjus.net)
- Daily auto-upgrades with auto-reboot - Daily auto-upgrades with auto-reboot
- Prometheus node-exporter + Promtail (logs to monitoring01) - Prometheus node-exporter + Promtail (logs to monitoring01)
- Monitoring scrape target auto-registration via `homelab.monitoring` options
- Custom root CA trust - Custom root CA trust
- DNS zone auto-registration via `homelab.dns` options - DNS zone auto-registration via `homelab.dns` options
@@ -310,7 +312,7 @@ This means:
11. Deploy by running `nixos-rebuild boot --flake URL#<hostname>` on the host. 11. Deploy by running `nixos-rebuild boot --flake URL#<hostname>` on the host.
12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry 12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry
**Note:** DNS A records are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file editing is required. **Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required.
### Important Patterns ### Important Patterns
@@ -333,6 +335,23 @@ All hosts ship metrics and logs to `monitoring01`:
- **Tracing**: Tempo for distributed tracing - **Tracing**: Tempo for distributed tracing
- **Profiling**: Pyroscope for continuous profiling - **Profiling**: Pyroscope for continuous profiling
**Scrape Target Auto-Generation:**
Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
Host monitoring options (`homelab.monitoring.*`):
- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets
- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host (job_name, port, metrics_path, scheme, scrape_interval, honor_labels)
Service modules declare their scrape targets directly (e.g., `services/ca/default.nix` declares step-ca on port 9000). The Prometheus config on monitoring01 auto-generates scrape configs from all hosts.
To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
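For example, a minimal sketch of the declaration pattern (mirroring the step-ca target referenced above, as added by this PR in `services/ca/default.nix`):

```nix
# The host exposes step-ca metrics on port 9000; the Prometheus config on
# monitoring01 picks this up and generates the matching scrape config.
{
  homelab.monitoring.scrapeTargets = [{
    job_name = "step-ca";
    port = 9000;
  }];
}
```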
### DNS Architecture ### DNS Architecture
- `ns1` (10.69.13.5) - Primary authoritative DNS + resolver - `ns1` (10.69.13.5) - Primary authoritative DNS + resolver

View File

@@ -26,7 +26,11 @@
}; };
}; };
}; };
# monitoring homelab.monitoring.scrapeTargets = [{
job_name = "wireguard";
port = 9586;
}];
services.prometheus.exporters.wireguard = { services.prometheus.exporters.wireguard = {
enable = true; enable = true;
}; };

lib/monitoring.nix (new file, 145 lines)
View File

@@ -0,0 +1,145 @@
{ lib }:
let
# Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
extractIP = address:
let
parts = lib.splitString "/" address;
in
builtins.head parts;
# Check if a network interface name looks like a VPN/tunnel interface
isVpnInterface = ifaceName:
lib.hasPrefix "wg" ifaceName ||
lib.hasPrefix "tun" ifaceName ||
lib.hasPrefix "tap" ifaceName ||
lib.hasPrefix "vti" ifaceName;
# Extract monitoring info from a single host configuration
# Returns null if host should not be included
extractHostMonitoring = name: hostConfig:
let
cfg = hostConfig.config;
monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
hostname = cfg.networking.hostName;
networks = cfg.systemd.network.networks or { };
# Filter out VPN interfaces and find networks with static addresses
physicalNetworks = lib.filterAttrs
(netName: netCfg:
let
ifaceName = netCfg.matchConfig.Name or "";
in
!(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
networks;
# Get addresses from physical networks only
networkAddresses = lib.flatten (
lib.mapAttrsToList
(netName: netCfg: netCfg.address or [ ])
physicalNetworks
);
firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
in
if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
null
else
{
inherit hostname;
ip = extractIP firstAddress;
scrapeTargets = monConfig.scrapeTargets or [ ];
};
# Generate node-exporter targets from all flake hosts
generateNodeExporterTargets = self: externalTargets:
let
nixosConfigs = self.nixosConfigurations or { };
hostList = lib.filter (x: x != null) (
lib.mapAttrsToList extractHostMonitoring nixosConfigs
);
flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList;
in
flakeTargets ++ (externalTargets.nodeExporter or [ ]);
# Generate scrape configs from all flake hosts and external targets
generateScrapeConfigs = self: externalTargets:
let
nixosConfigs = self.nixosConfigurations or { };
hostList = lib.filter (x: x != null) (
lib.mapAttrsToList extractHostMonitoring nixosConfigs
);
# Collect all scrapeTargets from all hosts, grouped by job_name
allTargets = lib.flatten (map
(host:
map
(target: {
inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
hostname = host.hostname;
})
host.scrapeTargets
)
hostList
);
# Group targets by job_name
grouped = lib.groupBy (t: t.job_name) allTargets;
# Generate a scrape config for each job
flakeScrapeConfigs = lib.mapAttrsToList
(jobName: targets:
let
first = builtins.head targets;
targetAddrs = map
(t:
let
portStr = toString t.port;
in
"${t.hostname}.home.2rjus.net:${portStr}")
targets;
config = {
job_name = jobName;
static_configs = [{
targets = targetAddrs;
}];
}
// (lib.optionalAttrs (first.metrics_path != "/metrics") {
metrics_path = first.metrics_path;
})
// (lib.optionalAttrs (first.scheme != "http") {
scheme = first.scheme;
})
// (lib.optionalAttrs (first.scrape_interval != null) {
scrape_interval = first.scrape_interval;
})
// (lib.optionalAttrs first.honor_labels {
honor_labels = true;
});
in
config
)
grouped;
# External scrape configs
externalScrapeConfigs = map
(ext: {
job_name = ext.job_name;
static_configs = [{
targets = ext.targets;
}];
} // (lib.optionalAttrs (ext ? metrics_path) {
metrics_path = ext.metrics_path;
}) // (lib.optionalAttrs (ext ? scheme) {
scheme = ext.scheme;
}) // (lib.optionalAttrs (ext ? scrape_interval) {
scrape_interval = ext.scrape_interval;
}))
(externalTargets.scrapeConfigs or [ ]);
in
flakeScrapeConfigs ++ externalScrapeConfigs;
in
{
inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
}
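A minimal sketch of what these generators produce (illustrative only, assuming a host named `http-proxy` that declares the `caddy` job on port 80 as elsewhere in this PR): targets sharing a `job_name` are grouped into a single scrape config, and non-default settings are taken from the first target in the group.

```nix
# Approximate result of generateScrapeConfigs for that single declaration;
# metrics_path, scheme, scrape_interval and honor_labels are omitted
# because they are left at their defaults.
[
  {
    job_name = "caddy";
    static_configs = [{ targets = [ "http-proxy.home.2rjus.net:80" ]; }];
  }
]
```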

View File

@@ -2,5 +2,6 @@
{ {
imports = [ imports = [
./dns.nix ./dns.nix
./monitoring.nix
]; ];
} }

View File

@@ -0,0 +1,50 @@
{ config, lib, ... }:
let
cfg = config.homelab.monitoring;
in
{
options.homelab.monitoring = {
enable = lib.mkOption {
type = lib.types.bool;
default = true;
description = "Include this host in Prometheus node-exporter scrape targets";
};
scrapeTargets = lib.mkOption {
type = lib.types.listOf (lib.types.submodule {
options = {
job_name = lib.mkOption {
type = lib.types.str;
description = "Prometheus scrape job name";
};
port = lib.mkOption {
type = lib.types.port;
description = "Port to scrape metrics from";
};
metrics_path = lib.mkOption {
type = lib.types.str;
default = "/metrics";
description = "HTTP path to scrape metrics from";
};
scheme = lib.mkOption {
type = lib.types.str;
default = "http";
description = "HTTP scheme (http or https)";
};
scrape_interval = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "Override the global scrape interval for this target";
};
honor_labels = lib.mkOption {
type = lib.types.bool;
default = false;
description = "Whether to honor labels from the scraped target";
};
};
});
default = [ ];
description = "Additional Prometheus scrape targets exposed by this host";
};
};
}
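A hypothetical per-host usage of these options (the job name and port below are illustrative, not taken from this repository):

```nix
{
  homelab.monitoring.scrapeTargets = [{
    job_name = "example-exporter";
    port = 9999;
    scheme = "https";
    scrape_interval = "60s";
    honor_labels = true;
  }];
  # Setting homelab.monitoring.enable = false instead would exclude the host
  # (including any scrapeTargets it declares) from the generated config.
}
```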

View File

@@ -1,5 +1,9 @@
{ pkgs, unstable, ... }: { pkgs, unstable, ... }:
{ {
homelab.monitoring.scrapeTargets = [{
job_name = "step-ca";
port = 9000;
}];
sops.secrets."ca_root_pw" = { sops.secrets."ca_root_pw" = {
sopsFile = ../../secrets/ca/secrets.yaml; sopsFile = ../../secrets/ca/secrets.yaml;
owner = "step-ca"; owner = "step-ca";

View File

@@ -1,5 +1,11 @@
{ pkgs, config, ... }: { pkgs, config, ... }:
{ {
homelab.monitoring.scrapeTargets = [{
job_name = "home-assistant";
port = 8123;
metrics_path = "/api/prometheus";
scrape_interval = "60s";
}];
# Enable the Home Assistant service # Enable the Home Assistant service
services.home-assistant = { services.home-assistant = {
enable = true; enable = true;

View File

@@ -3,4 +3,9 @@
imports = [ imports = [
./proxy.nix ./proxy.nix
]; ];
homelab.monitoring.scrapeTargets = [{
job_name = "caddy";
port = 80;
}];
} }

View File

@@ -1,5 +1,9 @@
{ pkgs, ... }: { pkgs, ... }:
{ {
homelab.monitoring.scrapeTargets = [{
job_name = "jellyfin";
port = 8096;
}];
services.jellyfin = { services.jellyfin = {
enable = true; enable = true;
}; };

View File

@@ -0,0 +1,12 @@
# Monitoring targets for hosts not managed by this flake
# These are manually maintained and combined with auto-generated targets
{
nodeExporter = [
"gunter.home.2rjus.net:9100"
];
scrapeConfigs = [
{ job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
{ job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
{ job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
];
}

View File

@@ -1,4 +1,11 @@
{ ... }: { self, lib, ... }:
let
monLib = import ../../lib/monitoring.nix { inherit lib; };
externalTargets = import ./external-targets.nix;
nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
in
{ {
services.prometheus = { services.prometheus = {
enable = true; enable = true;
@@ -45,26 +52,16 @@
]; ];
scrapeConfigs = [ scrapeConfigs = [
# Auto-generated node-exporter targets from flake hosts + external
{ {
job_name = "node-exporter"; job_name = "node-exporter";
static_configs = [ static_configs = [
{ {
targets = [ targets = nodeExporterTargets;
"ca.home.2rjus.net:9100"
"gunter.home.2rjus.net:9100"
"ha1.home.2rjus.net:9100"
"http-proxy.home.2rjus.net:9100"
"jelly01.home.2rjus.net:9100"
"monitoring01.home.2rjus.net:9100"
"nix-cache01.home.2rjus.net:9100"
"ns1.home.2rjus.net:9100"
"ns2.home.2rjus.net:9100"
"pgdb1.home.2rjus.net:9100"
"nats1.home.2rjus.net:9100"
];
} }
]; ];
} }
# Local monitoring services (not auto-generated)
{ {
job_name = "prometheus"; job_name = "prometheus";
static_configs = [ static_configs = [
@@ -85,7 +82,7 @@
job_name = "grafana"; job_name = "grafana";
static_configs = [ static_configs = [
{ {
targets = [ "localhost:3100" ]; targets = [ "localhost:3000" ];
} }
]; ];
} }
@@ -98,13 +95,23 @@
]; ];
} }
{ {
job_name = "restic_rest"; job_name = "pushgateway";
honor_labels = true;
static_configs = [ static_configs = [
{ {
targets = [ "10.69.12.52:8000" ]; targets = [ "localhost:9091" ];
} }
]; ];
} }
{
job_name = "labmon";
static_configs = [
{
targets = [ "monitoring01.home.2rjus.net:9969" ];
}
];
}
# pve-exporter with complex relabel config
{ {
job_name = "pve-exporter"; job_name = "pve-exporter";
static_configs = [ static_configs = [
@@ -133,91 +140,8 @@
} }
]; ];
} }
{ ] ++ autoScrapeConfigs;
job_name = "caddy";
static_configs = [
{
targets = [ "http-proxy.home.2rjus.net" ];
}
];
}
{
job_name = "jellyfin";
static_configs = [
{
targets = [ "jelly01.home.2rjus.net:8096" ];
}
];
}
{
job_name = "smartctl";
static_configs = [
{
targets = [ "gunter.home.2rjus.net:9633" ];
}
];
}
{
job_name = "wireguard";
static_configs = [
{
targets = [ "http-proxy.home.2rjus.net:9586" ];
}
];
}
{
job_name = "home-assistant";
scrape_interval = "60s";
metrics_path = "/api/prometheus";
static_configs = [
{
targets = [ "ha1.home.2rjus.net:8123" ];
}
];
}
{
job_name = "ghettoptt";
static_configs = [
{
targets = [ "gunter.home.2rjus.net:8989" ];
}
];
}
{
job_name = "step-ca";
static_configs = [
{
targets = [ "ca.home.2rjus.net:9000" ];
}
];
}
{
job_name = "labmon";
static_configs = [
{
targets = [ "monitoring01.home.2rjus.net:9969" ];
}
];
}
{
job_name = "pushgateway";
honor_labels = true;
static_configs = [
{
targets = [ "localhost:9091" ];
}
];
}
{
job_name = "nix-cache_caddy";
scheme = "https";
static_configs = [
{
targets = [ "nix-cache.home.2rjus.net" ];
}
];
}
];
pushgateway = { pushgateway = {
enable = true; enable = true;
web = { web = {

View File

@@ -57,6 +57,38 @@ groups:
annotations: annotations:
summary: "Promtail service not running on {{ $labels.instance }}" summary: "Promtail service not running on {{ $labels.instance }}"
description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes." description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
- alert: filesystem_filling_up
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
for: 1h
labels:
severity: warning
annotations:
summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
- alert: systemd_not_running
expr: node_systemd_system_running == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Systemd not in running state on {{ $labels.instance }}"
description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
- alert: high_file_descriptors
expr: node_filefd_allocated / node_filefd_maximum > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High file descriptor usage on {{ $labels.instance }}"
description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
- alert: host_reboot
expr: changes(node_boot_time_seconds[10m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: "Host {{ $labels.instance }} has rebooted"
description: "Host {{ $labels.instance }} has rebooted."
- name: nameserver_rules - name: nameserver_rules
rules: rules:
- alert: unbound_down - alert: unbound_down
@@ -75,7 +107,7 @@ groups:
annotations: annotations:
summary: "NSD not running on {{ $labels.instance }}" summary: "NSD not running on {{ $labels.instance }}"
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes." description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
- name: http-proxy_rules - name: http_proxy_rules
rules: rules:
- alert: caddy_down - alert: caddy_down
expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0 expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@ groups:
annotations: annotations:
summary: "Caddy not running on {{ $labels.instance }}" summary: "Caddy not running on {{ $labels.instance }}"
description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes." description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
- alert: caddy_upstream_unhealthy
expr: caddy_reverse_proxy_upstreams_healthy == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
- alert: caddy_high_error_rate
expr: rate(caddy_http_request_errors_total[5m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "High HTTP error rate on {{ $labels.instance }}"
description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
- name: nats_rules - name: nats_rules
rules: rules:
- alert: nats_down - alert: nats_down
@@ -97,7 +145,7 @@ groups:
description: "NATS has been down on {{ $labels.instance }} more than 5 minutes." description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
- name: nix_cache_rules - name: nix_cache_rules
rules: rules:
- alert: build-flakes_service_not_active_recently - alert: build_flakes_service_not_active_recently
expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1 expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
for: 0m for: 0m
labels: labels:
@@ -138,7 +186,7 @@ groups:
annotations: annotations:
summary: "Home assistant not running on {{ $labels.instance }}" summary: "Home assistant not running on {{ $labels.instance }}"
description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes." description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
- alert: zigbee2qmtt_down - alert: zigbee2mqtt_down
expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0 expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
for: 5m for: 5m
labels: labels:
@@ -156,7 +204,7 @@ groups:
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes." description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
- name: smartctl_rules - name: smartctl_rules
rules: rules:
- alert: SmartCriticalWarning - alert: smart_critical_warning
expr: smartctl_device_critical_warning > 0 expr: smartctl_device_critical_warning > 0
for: 0m for: 0m
labels: labels:
@@ -164,7 +212,7 @@ groups:
annotations: annotations:
summary: SMART critical warning (instance {{ $labels.instance }}) summary: SMART critical warning (instance {{ $labels.instance }})
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors - alert: smart_media_errors
expr: smartctl_device_media_errors > 0 expr: smartctl_device_media_errors > 0
for: 0m for: 0m
labels: labels:
@@ -172,7 +220,7 @@ groups:
annotations: annotations:
summary: SMART media errors (instance {{ $labels.instance }}) summary: SMART media errors (instance {{ $labels.instance }})
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartWearoutIndicator - alert: smart_wearout_indicator
expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
for: 0m for: 0m
labels: labels:
@@ -180,20 +228,29 @@ groups:
annotations: annotations:
summary: SMART Wearout Indicator (instance {{ $labels.instance }}) summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: smartctl_high_temperature
expr: smartctl_device_temperature > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Disk temperature above 60C on {{ $labels.instance }}"
description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
- name: wireguard_rules - name: wireguard_rules
rules: rules:
- alert: WireguardHandshake - alert: wireguard_handshake_timeout
expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300 expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Wireguard handshake timeout on {{ $labels.instance }}" summary: "Wireguard handshake timeout on {{ $labels.instance }}"
description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes." description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
- name: monitoring_rules - name: monitoring_rules
rules: rules:
- alert: prometheus_not_running - alert: prometheus_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -201,6 +258,7 @@ groups:
description: "Prometheus service not running on {{ $labels.instance }}" description: "Prometheus service not running on {{ $labels.instance }}"
- alert: alertmanager_not_running - alert: alertmanager_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -208,13 +266,7 @@ groups:
description: "Alertmanager service not running on {{ $labels.instance }}" description: "Alertmanager service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running - alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
labels: for: 5m
severity: critical
annotations:
summary: "Pushgateway service not running on {{ $labels.instance }}"
description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: pushgateway_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -222,6 +274,7 @@ groups:
description: "Pushgateway service not running on {{ $labels.instance }}" description: "Pushgateway service not running on {{ $labels.instance }}"
- alert: loki_not_running - alert: loki_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
@@ -229,6 +282,7 @@ groups:
description: "Loki service not running on {{ $labels.instance }}" description: "Loki service not running on {{ $labels.instance }}"
- alert: grafana_not_running - alert: grafana_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@@ -236,6 +290,7 @@ groups:
description: "Grafana service not running on {{ $labels.instance }}" description: "Grafana service not running on {{ $labels.instance }}"
- alert: tempo_not_running - alert: tempo_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@@ -243,8 +298,53 @@ groups:
description: "Tempo service not running on {{ $labels.instance }}" description: "Tempo service not running on {{ $labels.instance }}"
- alert: pyroscope_not_running - alert: pyroscope_not_running
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0 expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Pyroscope service not running on {{ $labels.instance }}" summary: "Pyroscope service not running on {{ $labels.instance }}"
description: "Pyroscope service not running on {{ $labels.instance }}" description: "Pyroscope service not running on {{ $labels.instance }}"
- name: certificate_rules
rules:
- alert: certificate_expiring_soon
expr: labmon_tlsconmon_certificate_seconds_left < 86400
for: 5m
labels:
severity: warning
annotations:
summary: "TLS certificate expiring soon for {{ $labels.instance }}"
description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
- alert: certificate_check_error
expr: labmon_tlsconmon_certificate_check_error == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Error checking certificate for {{ $labels.address }}"
description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
- alert: step_ca_certificate_expiring
expr: labmon_stepmon_certificate_seconds_left < 3600
for: 5m
labels:
severity: critical
annotations:
summary: "Step-CA certificate expiring for {{ $labels.instance }}"
description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
- name: proxmox_rules
rules:
- alert: pve_node_down
expr: pve_up{id=~"node/.*"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Proxmox node {{ $labels.id }} is down"
description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
- alert: pve_guest_stopped
expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Proxmox VM {{ $labels.id }} is stopped"
description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."

View File

@@ -6,4 +6,10 @@
./proxy.nix ./proxy.nix
./nix.nix ./nix.nix
]; ];
homelab.monitoring.scrapeTargets = [{
job_name = "nix-cache_caddy";
port = 443;
scheme = "https";
}];
} }