monitoring: auto-generate Prometheus scrape targets from host configs #16
CLAUDE.md (23 lines changed)
@@ -122,9 +122,10 @@ This ensures documentation matches the exact nixpkgs version (currently NixOS 25
 - Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix
 - Monitoring: node-exporter and promtail on every host
 - `/modules/` - Custom NixOS modules
-  - `homelab/` - Homelab-specific options (DNS automation, etc.)
+  - `homelab/` - Homelab-specific options (DNS automation, monitoring scrape targets)
 - `/lib/` - Nix library functions
   - `dns-zone.nix` - DNS zone generation functions
+  - `monitoring.nix` - Prometheus scrape target generation functions
 - `/services/` - Reusable service modules, selectively imported by hosts
   - `home-assistant/` - Home automation stack
   - `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
@@ -156,6 +157,7 @@ All hosts automatically get:
 - Internal ACME CA integration (ca.home.2rjus.net)
 - Daily auto-upgrades with auto-reboot
 - Prometheus node-exporter + Promtail (logs to monitoring01)
+- Monitoring scrape target auto-registration via `homelab.monitoring` options
 - Custom root CA trust
 - DNS zone auto-registration via `homelab.dns` options
 
@@ -310,7 +312,7 @@ This means:
 11. Deploy by running `nixos-rebuild boot --flake URL#<hostname>` on the host.
 12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry
 
-**Note:** DNS A records are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file editing is required.
+**Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required.
 
 ### Important Patterns
 
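For reference, the generators read the first static address on a physical interface. A typical host declaration looks roughly like the sketch below; the interface name and IP are illustrative, not from this PR:

```nix
# Illustrative host network config; the generators read matchConfig.Name
# (to skip VPN/tunnel interfaces) and the first entry of `address`.
systemd.network.networks."10-lan" = {
  matchConfig.Name = "ens18";       # assumed interface name
  address = [ "10.69.13.42/24" ];   # CIDR notation; extractIP strips the "/24"
};
```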
@@ -333,6 +335,23 @@ All hosts ship metrics and logs to `monitoring01`:
 - **Tracing**: Tempo for distributed tracing
 - **Profiling**: Pyroscope for continuous profiling
 
+**Scrape Target Auto-Generation:**
+
+Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
+
+- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
+- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
+- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
+- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
+
+Host monitoring options (`homelab.monitoring.*`):
+
+- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets
+- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host (job_name, port, metrics_path, scheme, scrape_interval, honor_labels)
+
+Service modules declare their scrape targets directly (e.g., `services/ca/default.nix` declares step-ca on port 9000). The Prometheus config on monitoring01 auto-generates scrape configs from all hosts.
+
+To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
+
 ### DNS Architecture
 
 - `ns1` (10.69.13.5) - Primary authoritative DNS + resolver
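As a sketch of these options in use (the values here are invented for illustration; the real declarations from this PR appear in the per-service diffs below):

```nix
# Hypothetical service module: register an extra scrape job for this host.
# A host could also opt out of the node-exporter job entirely.
{
  homelab.monitoring = {
    # enable = false;  # would drop this host from the node-exporter targets
    scrapeTargets = [{
      job_name = "example-exporter";  # assumed job name
      port = 9123;                    # assumed port
      scrape_interval = "30s";
    }];
  };
}
```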
@@ -26,7 +26,11 @@
 };
 };
 };
 # monitoring
+homelab.monitoring.scrapeTargets = [{
+  job_name = "wireguard";
+  port = 9586;
+}];
 services.prometheus.exporters.wireguard = {
   enable = true;
 };
lib/monitoring.nix (new file, 145 lines)
@@ -0,0 +1,145 @@
+{ lib }:
+let
+  # Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
+  extractIP = address:
+    let
+      parts = lib.splitString "/" address;
+    in
+    builtins.head parts;
+
+  # Check if a network interface name looks like a VPN/tunnel interface
+  isVpnInterface = ifaceName:
+    lib.hasPrefix "wg" ifaceName ||
+    lib.hasPrefix "tun" ifaceName ||
+    lib.hasPrefix "tap" ifaceName ||
+    lib.hasPrefix "vti" ifaceName;
+
+  # Extract monitoring info from a single host configuration
+  # Returns null if host should not be included
+  extractHostMonitoring = name: hostConfig:
+    let
+      cfg = hostConfig.config;
+      monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
+      dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
+      hostname = cfg.networking.hostName;
+      networks = cfg.systemd.network.networks or { };
+
+      # Filter out VPN interfaces and find networks with static addresses
+      physicalNetworks = lib.filterAttrs
+        (netName: netCfg:
+          let
+            ifaceName = netCfg.matchConfig.Name or "";
+          in
+          !(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
+        networks;
+
+      # Get addresses from physical networks only
+      networkAddresses = lib.flatten (
+        lib.mapAttrsToList
+          (netName: netCfg: netCfg.address or [ ])
+          physicalNetworks
+      );
+
+      firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
+    in
+    if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
+      null
+    else
+      {
+        inherit hostname;
+        ip = extractIP firstAddress;
+        scrapeTargets = monConfig.scrapeTargets or [ ];
+      };
+
+  # Generate node-exporter targets from all flake hosts
+  generateNodeExporterTargets = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+      flakeTargets = map (host: "${host.hostname}.home.2rjus.net:9100") hostList;
+    in
+    flakeTargets ++ (externalTargets.nodeExporter or [ ]);
+
+  # Generate scrape configs from all flake hosts and external targets
+  generateScrapeConfigs = self: externalTargets:
+    let
+      nixosConfigs = self.nixosConfigurations or { };
+      hostList = lib.filter (x: x != null) (
+        lib.mapAttrsToList extractHostMonitoring nixosConfigs
+      );
+
+      # Collect all scrapeTargets from all hosts, grouped by job_name
+      allTargets = lib.flatten (map
+        (host:
+          map
+            (target: {
+              inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
+              hostname = host.hostname;
+            })
+            host.scrapeTargets
+        )
+        hostList
+      );
+
+      # Group targets by job_name
+      grouped = lib.groupBy (t: t.job_name) allTargets;
+
+      # Generate a scrape config for each job
+      flakeScrapeConfigs = lib.mapAttrsToList
+        (jobName: targets:
+          let
+            first = builtins.head targets;
+            targetAddrs = map
+              (t:
+                let
+                  portStr = toString t.port;
+                in
+                "${t.hostname}.home.2rjus.net:${portStr}")
+              targets;
+            config = {
+              job_name = jobName;
+              static_configs = [{
+                targets = targetAddrs;
+              }];
+            }
+            // (lib.optionalAttrs (first.metrics_path != "/metrics") {
+              metrics_path = first.metrics_path;
+            })
+            // (lib.optionalAttrs (first.scheme != "http") {
+              scheme = first.scheme;
+            })
+            // (lib.optionalAttrs (first.scrape_interval != null) {
+              scrape_interval = first.scrape_interval;
+            })
+            // (lib.optionalAttrs first.honor_labels {
+              honor_labels = true;
+            });
+          in
+          config
+        )
+        grouped;
+
+      # External scrape configs
+      externalScrapeConfigs = map
+        (ext: {
+          job_name = ext.job_name;
+          static_configs = [{
+            targets = ext.targets;
+          }];
+        } // (lib.optionalAttrs (ext ? metrics_path) {
+          metrics_path = ext.metrics_path;
+        }) // (lib.optionalAttrs (ext ? scheme) {
+          scheme = ext.scheme;
+        }) // (lib.optionalAttrs (ext ? scrape_interval) {
+          scrape_interval = ext.scrape_interval;
+        }))
+        (externalTargets.scrapeConfigs or [ ]);
+    in
+    flakeScrapeConfigs ++ externalScrapeConfigs;
+
+in
+{
+  inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
+}
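The two entry points are consumed from the Prometheus config (see the Prometheus hunks further down); a minimal sketch of the call pattern, assuming `self` is the flake's own outputs and `lib` is in scope:

```nix
# Sketch only: the Prometheus module below does exactly this.
let
  monLib = import ./lib/monitoring.nix { inherit lib; };
  external = import ./services/monitoring/external-targets.nix;
in
{
  nodeTargets = monLib.generateNodeExporterTargets self external;  # list of "host:9100" strings
  scrapeConfigs = monLib.generateScrapeConfigs self external;      # list of scrape_config attrsets
}
```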
@@ -2,5 +2,6 @@
 {
   imports = [
     ./dns.nix
+    ./monitoring.nix
   ];
 }
modules/homelab/monitoring.nix (new file, 50 lines)
@@ -0,0 +1,50 @@
+{ config, lib, ... }:
+let
+  cfg = config.homelab.monitoring;
+in
+{
+  options.homelab.monitoring = {
+    enable = lib.mkOption {
+      type = lib.types.bool;
+      default = true;
+      description = "Include this host in Prometheus node-exporter scrape targets";
+    };
+
+    scrapeTargets = lib.mkOption {
+      type = lib.types.listOf (lib.types.submodule {
+        options = {
+          job_name = lib.mkOption {
+            type = lib.types.str;
+            description = "Prometheus scrape job name";
+          };
+          port = lib.mkOption {
+            type = lib.types.port;
+            description = "Port to scrape metrics from";
+          };
+          metrics_path = lib.mkOption {
+            type = lib.types.str;
+            default = "/metrics";
+            description = "HTTP path to scrape metrics from";
+          };
+          scheme = lib.mkOption {
+            type = lib.types.str;
+            default = "http";
+            description = "HTTP scheme (http or https)";
+          };
+          scrape_interval = lib.mkOption {
+            type = lib.types.nullOr lib.types.str;
+            default = null;
+            description = "Override the global scrape interval for this target";
+          };
+          honor_labels = lib.mkOption {
+            type = lib.types.bool;
+            default = false;
+            description = "Whether to honor labels from the scraped target";
+          };
+        };
+      });
+      default = [ ];
+      description = "Additional Prometheus scrape targets exposed by this host";
+    };
+  };
+}
@@ -1,5 +1,9 @@
 { pkgs, unstable, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "step-ca";
+    port = 9000;
+  }];
   sops.secrets."ca_root_pw" = {
     sopsFile = ../../secrets/ca/secrets.yaml;
     owner = "step-ca";
@@ -1,5 +1,11 @@
 { pkgs, config, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "home-assistant";
+    port = 8123;
+    metrics_path = "/api/prometheus";
+    scrape_interval = "60s";
+  }];
   # Enable the Home Assistant service
   services.home-assistant = {
     enable = true;
@@ -3,4 +3,9 @@
   imports = [
     ./proxy.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "caddy";
+    port = 80;
+  }];
 }
@@ -1,5 +1,9 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "jellyfin";
+    port = 8096;
+  }];
   services.jellyfin = {
     enable = true;
   };
services/monitoring/external-targets.nix (new file, 12 lines)
@@ -0,0 +1,12 @@
+# Monitoring targets for hosts not managed by this flake
+# These are manually maintained and combined with auto-generated targets
+{
+  nodeExporter = [
+    "gunter.home.2rjus.net:9100"
+  ];
+  scrapeConfigs = [
+    { job_name = "smartctl"; targets = [ "gunter.home.2rjus.net:9633" ]; }
+    { job_name = "ghettoptt"; targets = [ "gunter.home.2rjus.net:8989" ]; }
+    { job_name = "restic_rest"; targets = [ "10.69.12.52:8000" ]; }
+  ];
+}
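Registering another non-flake host is then a one-line edit per job; a sketch (the hostname below is made up):

```nix
# Hypothetical: add a second non-NixOS machine's node-exporter target.
nodeExporter = [
  "gunter.home.2rjus.net:9100"
  "newbox.home.2rjus.net:9100"  # assumed hostname
];
```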
@@ -1,4 +1,11 @@
-{ ... }:
+{ self, lib, ... }:
+let
+  monLib = import ../../lib/monitoring.nix { inherit lib; };
+  externalTargets = import ./external-targets.nix;
+
+  nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
+  autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
+in
 {
   services.prometheus = {
     enable = true;
@@ -45,26 +52,16 @@
     ];
+
     scrapeConfigs = [
+      # Auto-generated node-exporter targets from flake hosts + external
       {
         job_name = "node-exporter";
         static_configs = [
           {
-            targets = [
-              "ca.home.2rjus.net:9100"
-              "gunter.home.2rjus.net:9100"
-              "ha1.home.2rjus.net:9100"
-              "http-proxy.home.2rjus.net:9100"
-              "jelly01.home.2rjus.net:9100"
-              "monitoring01.home.2rjus.net:9100"
-              "nix-cache01.home.2rjus.net:9100"
-              "ns1.home.2rjus.net:9100"
-              "ns2.home.2rjus.net:9100"
-              "pgdb1.home.2rjus.net:9100"
-              "nats1.home.2rjus.net:9100"
-            ];
+            targets = nodeExporterTargets;
           }
         ];
       }
+      # Local monitoring services (not auto-generated)
       {
         job_name = "prometheus";
         static_configs = [
@@ -85,7 +82,7 @@
         job_name = "grafana";
         static_configs = [
           {
-            targets = [ "localhost:3100" ];
+            targets = [ "localhost:3000" ];
           }
         ];
       }
@@ -98,13 +95,23 @@
         ];
       }
       {
-        job_name = "restic_rest";
+        job_name = "pushgateway";
+        honor_labels = true;
         static_configs = [
           {
-            targets = [ "10.69.12.52:8000" ];
+            targets = [ "localhost:9091" ];
           }
         ];
       }
+      {
+        job_name = "labmon";
+        static_configs = [
+          {
+            targets = [ "monitoring01.home.2rjus.net:9969" ];
+          }
+        ];
+      }
+      # pve-exporter with complex relabel config
      {
         job_name = "pve-exporter";
         static_configs = [
@@ -133,91 +140,8 @@
           }
         ];
       }
-      {
-        job_name = "caddy";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net" ];
-          }
-        ];
-      }
-      {
-        job_name = "jellyfin";
-        static_configs = [
-          {
-            targets = [ "jelly01.home.2rjus.net:8096" ];
-          }
-        ];
-      }
-      {
-        job_name = "smartctl";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:9633" ];
-          }
-        ];
-      }
-      {
-        job_name = "wireguard";
-        static_configs = [
-          {
-            targets = [ "http-proxy.home.2rjus.net:9586" ];
-          }
-        ];
-      }
-      {
-        job_name = "home-assistant";
-        scrape_interval = "60s";
-        metrics_path = "/api/prometheus";
-        static_configs = [
-          {
-            targets = [ "ha1.home.2rjus.net:8123" ];
-          }
-        ];
-      }
-      {
-        job_name = "ghettoptt";
-        static_configs = [
-          {
-            targets = [ "gunter.home.2rjus.net:8989" ];
-          }
-        ];
-      }
-      {
-        job_name = "step-ca";
-        static_configs = [
-          {
-            targets = [ "ca.home.2rjus.net:9000" ];
-          }
-        ];
-      }
-      {
-        job_name = "labmon";
-        static_configs = [
-          {
-            targets = [ "monitoring01.home.2rjus.net:9969" ];
-          }
-        ];
-      }
-      {
-        job_name = "pushgateway";
-        honor_labels = true;
-        static_configs = [
-          {
-            targets = [ "localhost:9091" ];
-          }
-        ];
-      }
-      {
-        job_name = "nix-cache_caddy";
-        scheme = "https";
-        static_configs = [
-          {
-            targets = [ "nix-cache.home.2rjus.net" ];
-          }
-        ];
-      }
-    ];
+    ] ++ autoScrapeConfigs;
     pushgateway = {
       enable = true;
       web = {
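For orientation, `autoScrapeConfigs` should evaluate to attrsets equivalent to the hand-written configs it replaces; hand-sketched here for two of the declared targets (illustration, not evaluator output):

```nix
# Expected shape of two generated entries, based on the declarations above:
[
  {
    job_name = "step-ca";
    static_configs = [{ targets = [ "ca.home.2rjus.net:9000" ]; }];
  }
  {
    job_name = "home-assistant";
    metrics_path = "/api/prometheus";
    scrape_interval = "60s";
    static_configs = [{ targets = [ "ha1.home.2rjus.net:8123" ]; }];
  }
  # ...plus caddy, jellyfin, wireguard, nix-cache_caddy, and the external
  # smartctl / ghettoptt / restic_rest configs from external-targets.nix
]
```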
@@ -57,6 +57,38 @@ groups:
     annotations:
       summary: "Promtail service not running on {{ $labels.instance }}"
       description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
+  - alert: filesystem_filling_up
+    expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
+      description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
+  - alert: systemd_not_running
+    expr: node_systemd_system_running == 0
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Systemd not in running state on {{ $labels.instance }}"
+      description: "Systemd is not in running state on {{ $labels.instance }}. The system may be in a degraded state."
+  - alert: high_file_descriptors
+    expr: node_filefd_allocated / node_filefd_maximum > 0.8
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High file descriptor usage on {{ $labels.instance }}"
+      description: "More than 80% of file descriptors are in use on {{ $labels.instance }}."
+  - alert: host_reboot
+    expr: changes(node_boot_time_seconds[10m]) > 0
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: "Host {{ $labels.instance }} has rebooted"
+      description: "Host {{ $labels.instance }} has rebooted."
 - name: nameserver_rules
   rules:
   - alert: unbound_down
@@ -75,7 +107,7 @@ groups:
     annotations:
       summary: "NSD not running on {{ $labels.instance }}"
       description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
-- name: http-proxy_rules
+- name: http_proxy_rules
   rules:
   - alert: caddy_down
     expr: node_systemd_unit_state {instance="http-proxy.home.2rjus.net:9100", name = "caddy.service", state = "active"} == 0
@@ -85,6 +117,22 @@ groups:
     annotations:
       summary: "Caddy not running on {{ $labels.instance }}"
       description: "Caddy has been down on {{ $labels.instance }} more than 5 minutes."
+  - alert: caddy_upstream_unhealthy
+    expr: caddy_reverse_proxy_upstreams_healthy == 0
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Caddy upstream unhealthy for {{ $labels.upstream }}"
+      description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy on {{ $labels.instance }}."
+  - alert: caddy_high_error_rate
+    expr: rate(caddy_http_request_errors_total[5m]) > 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High HTTP error rate on {{ $labels.instance }}"
+      description: "Caddy is experiencing a high rate of HTTP errors on {{ $labels.instance }}."
 - name: nats_rules
   rules:
   - alert: nats_down
@@ -97,7 +145,7 @@ groups:
       description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
 - name: nix_cache_rules
   rules:
-  - alert: build-flakes_service_not_active_recently
+  - alert: build_flakes_service_not_active_recently
     expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
     for: 0m
     labels:
@@ -138,7 +186,7 @@ groups:
     annotations:
       summary: "Home assistant not running on {{ $labels.instance }}"
       description: "Home assistant has been down on {{ $labels.instance }} more than 5 minutes."
-  - alert: zigbee2qmtt_down
+  - alert: zigbee2mqtt_down
     expr: node_systemd_unit_state {instance = "ha1.home.2rjus.net:9100", name = "zigbee2mqtt.service", state = "active"} == 0
     for: 5m
     labels:
@@ -156,7 +204,7 @@ groups:
       description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
 - name: smartctl_rules
   rules:
-  - alert: SmartCriticalWarning
+  - alert: smart_critical_warning
     expr: smartctl_device_critical_warning > 0
     for: 0m
     labels:
@@ -164,7 +212,7 @@ groups:
     annotations:
       summary: SMART critical warning (instance {{ $labels.instance }})
       description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - alert: SmartMediaErrors
+  - alert: smart_media_errors
     expr: smartctl_device_media_errors > 0
     for: 0m
     labels:
@@ -172,7 +220,7 @@ groups:
     annotations:
       summary: SMART media errors (instance {{ $labels.instance }})
       description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - alert: SmartWearoutIndicator
+  - alert: smart_wearout_indicator
     expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
     for: 0m
     labels:
@@ -180,20 +228,29 @@ groups:
     annotations:
       summary: SMART Wearout Indicator (instance {{ $labels.instance }})
       description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+  - alert: smartctl_high_temperature
+    expr: smartctl_device_temperature > 60
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Disk temperature above 60C on {{ $labels.instance }}"
+      description: "Disk {{ $labels.device }} on {{ $labels.instance }} has temperature {{ $value }}C."
 - name: wireguard_rules
   rules:
-  - alert: WireguardHandshake
-    expr: (time() - wireguard_latest_handshake_seconds{instance="http-proxy.home.2rjus.net:9586",interface="wg0",public_key="32Rb13wExcy8uI92JTnFdiOfkv0mlQ6f181WA741DHs="}) > 300
+  - alert: wireguard_handshake_timeout
+    expr: (time() - wireguard_latest_handshake_seconds{interface="wg0"}) > 300
     for: 1m
     labels:
       severity: warning
     annotations:
       summary: "Wireguard handshake timeout on {{ $labels.instance }}"
-      description: "Wireguard handshake timeout on {{ $labels.instance }} for more than 1 minutes."
+      description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
 - name: monitoring_rules
   rules:
   - alert: prometheus_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -201,6 +258,7 @@ groups:
       description: "Prometheus service not running on {{ $labels.instance }}"
   - alert: alertmanager_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -208,13 +266,7 @@ groups:
       description: "Alertmanager service not running on {{ $labels.instance }}"
   - alert: pushgateway_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
-    labels:
-      severity: critical
-    annotations:
-      summary: "Pushgateway service not running on {{ $labels.instance }}"
-      description: "Pushgateway service not running on {{ $labels.instance }}"
-  - alert: pushgateway_not_running
-    expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -222,6 +274,7 @@ groups:
       description: "Pushgateway service not running on {{ $labels.instance }}"
   - alert: loki_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -229,6 +282,7 @@ groups:
       description: "Loki service not running on {{ $labels.instance }}"
   - alert: grafana_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
+    for: 5m
     labels:
       severity: warning
     annotations:
@@ -236,6 +290,7 @@ groups:
       description: "Grafana service not running on {{ $labels.instance }}"
   - alert: tempo_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
+    for: 5m
     labels:
       severity: warning
     annotations:
@@ -243,8 +298,53 @@ groups:
       description: "Tempo service not running on {{ $labels.instance }}"
   - alert: pyroscope_not_running
     expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
+    for: 5m
     labels:
       severity: warning
     annotations:
       summary: "Pyroscope service not running on {{ $labels.instance }}"
       description: "Pyroscope service not running on {{ $labels.instance }}"
+- name: certificate_rules
+  rules:
+  - alert: certificate_expiring_soon
+    expr: labmon_tlsconmon_certificate_seconds_left < 86400
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "TLS certificate expiring soon for {{ $labels.instance }}"
+      description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
+  - alert: certificate_check_error
+    expr: labmon_tlsconmon_certificate_check_error == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Error checking certificate for {{ $labels.address }}"
+      description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
+  - alert: step_ca_certificate_expiring
+    expr: labmon_stepmon_certificate_seconds_left < 3600
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Step-CA certificate expiring for {{ $labels.instance }}"
+      description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
+- name: proxmox_rules
+  rules:
+  - alert: pve_node_down
+    expr: pve_up{id=~"node/.*"} == 0
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Proxmox node {{ $labels.id }} is down"
+      description: "Proxmox node {{ $labels.id }} has been down for more than 5 minutes."
+  - alert: pve_guest_stopped
+    expr: pve_up{id=~"qemu/.*"} == 0 and pve_onboot_status == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Proxmox VM {{ $labels.id }} is stopped"
+      description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
@@ -6,4 +6,10 @@
     ./proxy.nix
     ./nix.nix
   ];
+
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "nix-cache_caddy";
+    port = 443;
+    scheme = "https";
+  }];
 }