monitoring-gaps-implementation #20
@@ -1,5 +1,10 @@
 { config, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "authelia";
+    port = 9959;
+  }];
+
   sops.secrets.authelia_ldap_password = {
     format = "yaml";
     sopsFile = ../../secrets/auth01/secrets.yaml;
@@ -45,6 +50,12 @@
       storageEncryptionKeyFile = config.sops.secrets.authelia_storage_encryption_key_file.path;
     };
     settings = {
+      telemetry = {
+        metrics = {
+          enabled = true;
+          address = "tcp://0.0.0.0:9959";
+        };
+      };
       access_control = {
         default_policy = "two_factor";
       };
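Reviewer note: the homelab.monitoring.scrapeTargets option set above (and in the other host modules further down) is declared in the repo's monitoring module, which is not part of this diff. Purely for orientation, a minimal sketch of what such an option declaration could look like; the names and types here are assumptions, not the actual module:

    # Hypothetical option declaration -- not taken from this repository.
    { lib, ... }:
    {
      options.homelab.monitoring.scrapeTargets = lib.mkOption {
        type = lib.types.listOf (lib.types.submodule {
          options = {
            job_name = lib.mkOption { type = lib.types.str; };
            port = lib.mkOption { type = lib.types.port; };
          };
        });
        default = [ ];
        description = "Prometheus scrape targets this host exposes to the monitoring host.";
      };
    }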

@@ -7,8 +7,19 @@ let
   autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
 in
 {
+  # OpenBao token for scraping metrics
+  vault.secrets.openbao-token = {
+    secretPath = "hosts/monitoring01/openbao-token";
+    extractKey = "token";
+    outputDir = "/run/secrets/prometheus/openbao-token";
+    mode = "0400";
+    owner = "prometheus";
+    services = [ "prometheus" ];
+  };
   services.prometheus = {
     enable = true;
+    # syntax-only check because we use external credential files (e.g., openbao-token)
+    checkConfig = "syntax-only";
     alertmanager = {
       enable = true;
       configuration = {
@@ -61,6 +72,15 @@
           }
         ];
       }
+      # Systemd exporter on all hosts (same targets, different port)
+      {
+        job_name = "systemd-exporter";
+        static_configs = [
+          {
+            targets = map (t: builtins.replaceStrings [":9100"] [":9558"] t) nodeExporterTargets;
+          }
+        ];
+      }
       # Local monitoring services (not auto-generated)
       {
        job_name = "prometheus";
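Reviewer note on the port rewrite in the systemd-exporter job above: builtins.replaceStrings is plain substring substitution, so the node-exporter target list is reused verbatim with only the port swapped. A small evaluation sketch, with a made-up target list:

    # Hypothetical input; the repo builds nodeExporterTargets elsewhere.
    map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t)
      [ "auth01.home.2rjus.net:9100" "pgdb1.home.2rjus.net:9100" ]
    # => [ "auth01.home.2rjus.net:9558" "pgdb1.home.2rjus.net:9558" ]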
@@ -152,6 +172,22 @@ in
           }
         ];
       }
+      # OpenBao metrics with bearer token auth
+      {
+        job_name = "openbao";
+        scheme = "https";
+        metrics_path = "/v1/sys/metrics";
+        params = {
+          format = [ "prometheus" ];
+        };
+        static_configs = [{
+          targets = [ "vault01.home.2rjus.net:8200" ];
+        }];
+        authorization = {
+          type = "Bearer";
+          credentials_file = "/run/secrets/prometheus/openbao-token";
+        };
+      }
     ] ++ autoScrapeConfigs;
 
     pushgateway = {
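Reviewer note: the "] ++ autoScrapeConfigs;" line above appends the scrape configs generated from the hosts' homelab.monitoring.scrapeTargets via monLib.generateScrapeConfigs. That helper is not shown in this PR; the following is only a rough sketch of the idea, with simplified grouping and hypothetical attribute names:

    # Hypothetical helper -- the real monLib implementation is not part of this diff.
    { lib }:
    {
      generateScrapeConfigs = flake: externalTargets:
        let
          hosts = lib.attrValues flake.nixosConfigurations;
          # Every target a host declares, tagged with that host's FQDN.
          declared = lib.concatMap
            (host:
              map (t: t // { host = host.config.networking.fqdn; })
                host.config.homelab.monitoring.scrapeTargets)
            hosts;
          toScrapeConfig = t: {
            job_name = t.job_name;
            static_configs = [{ targets = [ "${t.host}:${toString t.port}" ]; }];
          };
        in
        # The real code presumably merges targets that share a job_name; this sketch
        # keeps one scrape config per declared target for brevity.
        map toScrapeConfig (declared ++ externalTargets);
    }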

@@ -115,6 +115,14 @@ groups:
         annotations:
           summary: "NSD not running on {{ $labels.instance }}"
           description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: unbound_low_cache_hit_ratio
+        expr: (rate(unbound_cache_hits_total[5m]) / (rate(unbound_cache_hits_total[5m]) + rate(unbound_cache_misses_total[5m]))) < 0.5
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low DNS cache hit ratio on {{ $labels.instance }}"
+          description: "Unbound cache hit ratio is below 50% on {{ $labels.instance }}."
   - name: http_proxy_rules
     rules:
       - alert: caddy_down
@@ -151,6 +159,14 @@
         annotations:
           summary: "NATS not running on {{ $labels.instance }}"
           description: "NATS has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: nats_slow_consumers
+        expr: nats_core_slow_consumer_count > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "NATS has slow consumers on {{ $labels.instance }}"
+          description: "NATS has {{ $value }} slow consumers on {{ $labels.instance }}."
   - name: nix_cache_rules
     rules:
       - alert: build_flakes_service_not_active_recently
@@ -364,3 +380,83 @@ groups:
         annotations:
           summary: "Proxmox VM {{ $labels.id }} is stopped"
           description: "Proxmox VM {{ $labels.id }} ({{ $labels.name }}) has onboot=1 but is stopped."
+  - name: postgres_rules
+    rules:
+      - alert: postgres_down
+        expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "PostgreSQL not running on {{ $labels.instance }}"
+          description: "PostgreSQL has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: postgres_exporter_down
+        expr: up{job="postgres"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "PostgreSQL exporter down on {{ $labels.instance }}"
+          description: "Cannot scrape PostgreSQL metrics from {{ $labels.instance }}."
+      - alert: postgres_high_connections
+        expr: pg_stat_activity_count / pg_settings_max_connections > 0.8
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "PostgreSQL connection pool near exhaustion on {{ $labels.instance }}"
+          description: "PostgreSQL is using over 80% of max_connections on {{ $labels.instance }}."
+  - name: auth_rules
+    rules:
+      - alert: authelia_down
+        expr: node_systemd_unit_state{instance="auth01.home.2rjus.net:9100", name="authelia-auth.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Authelia not running on {{ $labels.instance }}"
+          description: "Authelia has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: lldap_down
+        expr: node_systemd_unit_state{instance="auth01.home.2rjus.net:9100", name="lldap.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "LLDAP not running on {{ $labels.instance }}"
+          description: "LLDAP has been down on {{ $labels.instance }} more than 5 minutes."
+  - name: jellyfin_rules
+    rules:
+      - alert: jellyfin_down
+        expr: up{job="jellyfin"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Jellyfin not responding on {{ $labels.instance }}"
+          description: "Cannot scrape Jellyfin metrics from {{ $labels.instance }} for 5 minutes."
+  - name: vault_rules
+    rules:
+      - alert: openbao_down
+        expr: node_systemd_unit_state{instance="vault01.home.2rjus.net:9100", name="openbao.service", state="active"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "OpenBao not running on {{ $labels.instance }}"
+          description: "OpenBao has been down on {{ $labels.instance }} more than 5 minutes."
+      - alert: openbao_sealed
+        expr: vault_core_unsealed == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "OpenBao is sealed on {{ $labels.instance }}"
+          description: "OpenBao has been sealed on {{ $labels.instance }} for more than 5 minutes."
+      - alert: openbao_scrape_down
+        expr: up{job="openbao"} == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}"
+          description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}."
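Reviewer note: none of the hunks here show how this rules file is loaded. With the stock NixOS Prometheus module that is normally done from the monitoring host's config through one of the following options (the path below is hypothetical):

    {
      # Point Prometheus at the rules file on disk...
      services.prometheus.ruleFiles = [ ./rules/alerts.yaml ];
      # ...or inline its contents instead:
      # services.prometheus.rules = [ (builtins.readFile ./rules/alerts.yaml) ];
    }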

@@ -1,10 +1,16 @@
 { ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "nats";
+    port = 8222;
+  }];
+
   services.nats = {
     enable = true;
     jetstream = true;
     serverName = "nats1";
     settings = {
+      http_port = 8222;
       accounts = {
         ADMIN = {
           users = [

@@ -1,10 +1,24 @@
 { pkgs, ... }: {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "unbound";
+    port = 9167;
+  }];
+
   networking.firewall.allowedTCPPorts = [
     53
   ];
   networking.firewall.allowedUDPPorts = [
     53
   ];
 
+  services.prometheus.exporters.unbound = {
+    enable = true;
+    unbound.host = "unix:///run/unbound/unbound.ctl";
+  };
+
+  # Grant exporter access to unbound socket
+  systemd.services.prometheus-unbound-exporter.serviceConfig.SupplementaryGroups = [ "unbound" ];
+
   services.unbound = {
     enable = true;
+
@@ -23,6 +37,11 @@
       do-ip6 = "no";
       do-udp = "yes";
       do-tcp = "yes";
+      extended-statistics = true;
+    };
+    remote-control = {
+      control-enable = true;
+      control-interface = "/run/unbound/unbound.ctl";
     };
     stub-zone = {
       name = "home.2rjus.net";

@@ -1,5 +1,15 @@
 { pkgs, ... }:
 {
+  homelab.monitoring.scrapeTargets = [{
+    job_name = "postgres";
+    port = 9187;
+  }];
+
+  services.prometheus.exporters.postgres = {
+    enable = true;
+    runAsLocalSuperUser = true; # Use peer auth as postgres user
+  };
+
   services.postgresql = {
     enable = true;
     enableJIT = true;

@@ -166,6 +166,11 @@ in
     settings = {
       ui = true;
 
+      telemetry = {
+        prometheus_retention_time = "60s";
+        disable_hostname = true;
+      };
+
       storage.file.path = "/var/lib/openbao";
       listener.default = {
         type = "tcp";

@@ -9,4 +9,9 @@
       "processes"
     ];
   };
+
+  services.prometheus.exporters.systemd = {
+    enable = true;
+    # Default port: 9558
+  };
 }

terraform/vault/policies.tf (new file, 21 lines)
@@ -0,0 +1,21 @@
+# Generic policies for services (not host-specific)
+
+resource "vault_policy" "prometheus_metrics" {
+  name   = "prometheus-metrics"
+  policy = <<EOT
+path "sys/metrics" {
+  capabilities = ["read"]
+}
+EOT
+}
+
+# Long-lived token for Prometheus to scrape OpenBao metrics
+resource "vault_token" "prometheus_metrics" {
+  policies  = [vault_policy.prometheus_metrics.name]
+  ttl       = "8760h" # 1 year
+  renewable = true
+
+  metadata = {
+    purpose = "prometheus-metrics-scraping"
+  }
+}

@@ -92,6 +92,13 @@ locals {
       auto_generate = false
       data = { token = var.actions_token_1 }
     }
+
+    # Prometheus OpenBao token for scraping metrics
+    # Token is created by vault_token.prometheus_metrics in policies.tf
+    "hosts/monitoring01/openbao-token" = {
+      auto_generate = false
+      data = { token = vault_token.prometheus_metrics.client_token }
+    }
   }
 }
 

@@ -51,3 +51,4 @@ variable "actions_token_1" {
   type = string
   sensitive = true
 }
+