migrate-to-openbao-pki #31

Merged
torjus merged 3 commits from migrate-to-openbao-pki into master 2026-02-07 17:33:47 +00:00
9 changed files with 75 additions and 142 deletions
Showing only changes of commit 21db7e9573 - Show all commits

View File

@@ -0,0 +1,72 @@
# Certificate Monitoring Plan
## Summary
This document describes the removal of labmon certificate monitoring and outlines future needs for certificate monitoring in the homelab.
## What Was Removed
### labmon Service
The `labmon` service was a custom Go application that provided:
1. **StepMonitor**: Monitoring for step-ca (Smallstep CA) certificate provisioning and health
2. **TLSConnectionMonitor**: Periodic TLS connection checks to verify certificate validity and expiration
The service exposed Prometheus metrics at `:9969` including:
- `labmon_tlsconmon_certificate_seconds_left` - Time until certificate expiration
- `labmon_tlsconmon_certificate_check_error` - Whether the TLS check failed
- `labmon_stepmon_certificate_seconds_left` - Step-CA internal certificate expiration
### Affected Files
- `hosts/monitoring01/configuration.nix` - Removed labmon configuration block
- `services/monitoring/prometheus.nix` - Removed labmon scrape target
- `services/monitoring/rules.yml` - Removed `certificate_rules` alert group
- `services/monitoring/alloy.nix` - Deleted (was only used for labmon profiling)
- `services/monitoring/default.nix` - Removed alloy.nix import
### Removed Alerts
- `certificate_expiring_soon` - Warned when any monitored TLS cert other than step-ca's own serving cert (`ca.home.2rjus.net:443`) had < 24h validity left
- `step_ca_serving_cert_expiring` - Critical alert when step-ca's own serving certificate (auto-renewed every 24h) had < 1h validity left
- `certificate_check_error` - Warned when TLS connection check failed
- `step_ca_certificate_expiring` - Critical alert when the certificate tracked by StepMonitor (`labmon_stepmon_certificate_seconds_left`) had < 1h validity left
## Why It Was Removed
1. **step-ca decommissioned**: The primary monitoring target (step-ca) is no longer in use
2. **Outdated codebase**: labmon was a custom tool that required maintenance
3. **Limited value**: With ACME auto-renewal, certificates should renew automatically
## Current State
ACME certificates are now issued by OpenBao PKI at `vault.home.2rjus.net:8200`. The ACME protocol handles automatic renewal, and certificates are typically renewed well before expiration.
## Future Needs
While ACME handles renewal automatically, we should consider monitoring for:
1. **ACME renewal failures**: Alert when a certificate fails to renew
   - Could monitor ACME client logs (via Loki queries)
   - Could check certificate file modification times
2. **Certificate expiration as backup**: Even with auto-renewal, a last-resort alert for certificates approaching expiration would catch renewal failures
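3. **Unexpected certificate issuance**: Monitor for certificates that were issued unexpectedly. Public Certificate Transparency logs do not cover a private CA, so this would have to rely on the issuing CA's own records (e.g. OpenBao PKI issuance or audit logs)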
### Potential Solutions
1. **Prometheus blackbox_exporter**: Can probe TLS endpoints and export certificate expiration metrics
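   - `probe_ssl_earliest_cert_expiry` metric (a Unix timestamp, so alert on `probe_ssl_earliest_cert_expiry - time()` dropping below a threshold)
   - Already a standard tool, well-maintained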
2. **Custom Loki alerting**: Query ACME service logs for renewal failures
   - Works with existing infrastructure
   - No additional services needed
3. **Node-exporter textfile collector**: Script that checks local certificate files and writes expiration metrics
## Status
**Not yet implemented.** This document serves as a placeholder for future work on certificate monitoring.

View File

@@ -100,61 +100,6 @@
];
};
labmon = {
enable = true;
settings = {
ListenAddr = ":9969";
Profiling = true;
StepMonitors = [
{
Enabled = true;
BaseURL = "https://ca.home.2rjus.net";
RootID = "3381bda8015a86b9a3cd1851439d1091890a79005e0f1f7c4301fe4bccc29d80";
}
];
TLSConnectionMonitors = [
{
Enabled = true;
Address = "ca.home.2rjus.net:443";
Verify = true;
Duration = "12h";
}
{
Enabled = true;
Address = "jelly.home.2rjus.net:443";
Verify = true;
Duration = "12h";
}
{
Enabled = true;
Address = "grafana.home.2rjus.net:443";
Verify = true;
Duration = "12h";
}
{
Enabled = true;
Address = "prometheus.home.2rjus.net:443";
Verify = true;
Duration = "12h";
}
{
Enabled = true;
Address = "alertmanager.home.2rjus.net:443";
Verify = true;
Duration = "12h";
}
{
Enabled = true;
Address = "pyroscope.home.2rjus.net:443";
Verify = true;
Duration = "12h";
}
];
};
};
# Open ports in the firewall.
# networking.firewall.allowedTCPPorts = [ ... ];
# networking.firewall.allowedUDPPorts = [ ... ];

View File

@@ -5,7 +5,7 @@
package = pkgs.unstable.caddy;
configFile = pkgs.writeText "Caddyfile" ''
{
acme_ca https://ca.home.2rjus.net/acme/acme/directory
acme_ca https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory
metrics {
per_host

View File

@@ -1,41 +0,0 @@
# NixOS module that enables the Alloy service and installs its configuration.
# The configuration's only pipeline scrapes profiles from the labmon process
# and forwards them to a Pyroscope endpoint on the same host.
{ ... }:
{
# Turn on the Alloy service with default options.
services.alloy = {
enable = true;
};
# Alloy configuration, installed as alloy/config.alloy under /etc.
# Pipeline defined by the text below:
#   - pyroscope.write "local_pyroscope": sends profiles to http://localhost:4040.
#   - pyroscope.scrape "labmon": scrapes localhost:9969 with service_name
#     "labmon", forwarding to the writer above.
#   - Enabled profile types: process_cpu, memory, mutex, block, goroutine.
environment.etc."alloy/config.alloy" = {
enable = true;
mode = "0644";
text = ''
pyroscope.write "local_pyroscope" {
endpoint {
url = "http://localhost:4040"
}
}
pyroscope.scrape "labmon" {
targets = [{"__address__" = "localhost:9969", "service_name" = "labmon"}]
forward_to = [pyroscope.write.local_pyroscope.receiver]
profiling_config {
profile.process_cpu {
enabled = true
}
profile.memory {
enabled = true
}
profile.mutex {
enabled = true
}
profile.block {
enabled = true
}
profile.goroutine {
enabled = true
}
}
}
'';
};
}

View File

@@ -7,7 +7,6 @@
./pve.nix
./alerttonotify.nix
./pyroscope.nix
./alloy.nix
./tempo.nix
];
}

View File

@@ -178,14 +178,6 @@ in
}
];
}
{
job_name = "labmon";
static_configs = [
{
targets = [ "monitoring01.home.2rjus.net:9969" ];
}
];
}
# TODO: nix-cache_caddy can't be auto-generated because the cert is issued
# for nix-cache.home.2rjus.net (service CNAME), not nix-cache01 (hostname).
# Consider adding a target override to homelab.monitoring.scrapeTargets.

View File

@@ -338,40 +338,6 @@ groups:
annotations:
summary: "Pyroscope service not running on {{ $labels.instance }}"
description: "Pyroscope service not running on {{ $labels.instance }}"
- name: certificate_rules
rules:
- alert: certificate_expiring_soon
expr: labmon_tlsconmon_certificate_seconds_left{address!="ca.home.2rjus.net:443"} < 86400
for: 5m
labels:
severity: warning
annotations:
summary: "TLS certificate expiring soon for {{ $labels.instance }}"
description: "TLS certificate for {{ $labels.address }} is expiring within 24 hours."
- alert: step_ca_serving_cert_expiring
expr: labmon_tlsconmon_certificate_seconds_left{address="ca.home.2rjus.net:443"} < 3600
for: 5m
labels:
severity: critical
annotations:
summary: "Step-CA serving certificate expiring"
description: "The step-ca serving certificate (24h auto-renewed) has less than 1 hour of validity left. Renewal may have failed."
- alert: certificate_check_error
expr: labmon_tlsconmon_certificate_check_error == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Error checking certificate for {{ $labels.address }}"
description: "Certificate check is failing for {{ $labels.address }} on {{ $labels.instance }}."
- alert: step_ca_certificate_expiring
expr: labmon_stepmon_certificate_seconds_left < 3600
for: 5m
labels:
severity: critical
annotations:
summary: "Step-CA certificate expiring for {{ $labels.instance }}"
description: "Step-CA certificate is expiring within 1 hour on {{ $labels.instance }}."
- name: proxmox_rules
rules:
- alert: pve_node_down

View File

@@ -5,7 +5,7 @@
package = pkgs.unstable.caddy;
configFile = pkgs.writeText "Caddyfile" ''
{
acme_ca https://ca.home.2rjus.net/acme/acme/directory
acme_ca https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory
metrics
}

View File

@@ -3,7 +3,7 @@
security.acme = {
acceptTerms = true;
defaults = {
server = "https://ca.home.2rjus.net/acme/acme/directory";
server = "https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory";
email = "root@home.2rjus.net";
dnsPropagationCheck = false;
};