Compare commits
131 Commits
45a5a10881
...
jellyfin-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
16ef202530
|
|||
|
5f3508a6d4
|
|||
|
2ca2509083
|
|||
|
58702bd10b
|
|||
|
c9f47acb01
|
|||
|
09ce018fb2
|
|||
| 3042803c4d | |||
|
1e7200b494
|
|||
|
eec1e374b2
|
|||
|
fcc410afad
|
|||
|
59f0c7ceda
|
|||
| d713f06c6e | |||
|
7374d1ff7f
|
|||
| e912c75b6c | |||
|
b218b4f8bc
|
|||
|
65acf13e6f
|
|||
| 95a96b2192 | |||
|
4f593126c0
|
|||
| 1bba6f106a | |||
|
a6013d3950
|
|||
| 7f69c0738a | |||
|
35924c7b01
|
|||
|
87d8571d62
|
|||
|
43c81f6688
|
|||
|
58f901ad3e
|
|||
|
c13921d302
|
|||
|
2903873d52
|
|||
|
74e7c9faa4
|
|||
| 471f536f1f | |||
|
a013e80f1a
|
|||
|
4cbaa33475
|
|||
|
e329f87b0b
|
|||
|
c151f31011
|
|||
| f5362d6936 | |||
|
3e7aabc73a
|
|||
|
361e7f2a1b
|
|||
|
1942591d2e
|
|||
|
4d614d8716
|
|||
| fd7caf7f00 | |||
|
af8e385b6e
|
|||
|
0db9fc6802
|
|||
|
5d68662035
|
|||
|
d485948df0
|
|||
|
7b804450a3
|
|||
|
2f0dad1acc
|
|||
|
1544415ef3
|
|||
|
5babd7f507
|
|||
|
7e0c5fbf0f
|
|||
|
ffaf95d109
|
|||
|
b2b6ab4799
|
|||
|
5d3d93b280
|
|||
|
ae823e439d
|
|||
|
0d9f49a3b4
|
|||
|
08d9e1ec3f
|
|||
|
fa8d65b612
|
|||
|
6726f111e3
|
|||
| 3a083285cb | |||
|
ed1821b073
|
|||
|
fa4a418007
|
|||
| 963e5f6d3c | |||
|
0bc10cb1fe
|
|||
|
b03e2e8ee4
|
|||
|
ddcbc30665
|
|||
|
75210805d5
|
|||
|
ade0538717
|
|||
|
83fce5f927
|
|||
|
afff3f28ca
|
|||
|
49f7e3ae2e
|
|||
|
751edfc11d
|
|||
|
98a7301985
|
|||
| 34efa58cfe | |||
|
5bfb51a497
|
|||
|
f83145d97a
|
|||
|
47747329c4
|
|||
|
2d9ca2a73f
|
|||
|
98ea679ef2
|
|||
|
b709c0b703
|
|||
|
33c5d5b3f0
|
|||
|
0a28c5f495
|
|||
|
9bd48e0808
|
|||
|
1460eea700
|
|||
|
98c4f54f94
|
|||
|
d1b0a5dc20
|
|||
|
4d32707130
|
|||
|
8e1753c2c8
|
|||
|
75e4fb61a5
|
|||
|
2be213e454
|
|||
|
12c252653b
|
|||
|
6493338c4c
|
|||
|
6e08ba9720
|
|||
|
7ff3d2a09b
|
|||
|
e85f15b73d
|
|||
|
2f5a2a4bf1
|
|||
|
287141c623
|
|||
|
9ed11b712f
|
|||
|
ffad2dd205
|
|||
|
ed7d2aa727
|
|||
|
bf7a025364
|
|||
| 4ae99dbc89 | |||
|
5c142b1323
|
|||
|
4091e51f41
|
|||
|
a8e558a6b7
|
|||
|
4efc798c38
|
|||
|
016f8c9119
|
|||
| fec2a261ab | |||
|
60c04a2052
|
|||
|
39e3f37263
|
|||
| a2d93baba8 | |||
|
f66dfc753c
|
|||
| 79a6a72719 | |||
|
89d0a6f358
|
|||
|
03ebee4d82
|
|||
|
05630eb4d4
|
|||
|
1e52eec02a
|
|||
|
d333aa0164
|
|||
|
a5d5827dcc
|
|||
|
1c13ec12a4
|
|||
|
4bf0eeeadb
|
|||
| 304cb117ce | |||
|
02270a0e4a
|
|||
|
030e8518c5
|
|||
|
9ffdd4f862
|
|||
|
0b977808ca
|
|||
|
8786113f8f
|
|||
|
fdb2c31f84
|
|||
|
78eb04205f
|
|||
| 19cb61ebbc | |||
|
9ed09c9a9c
|
|||
|
b31c64f1b9
|
|||
|
54b6e37420
|
|||
|
b845a8bb8b
|
@@ -19,7 +19,7 @@ You may receive:
|
|||||||
## Audit Log Structure
|
## Audit Log Structure
|
||||||
|
|
||||||
Logs are shipped to Loki via promtail. Audit events use these labels:
|
Logs are shipped to Loki via promtail. Audit events use these labels:
|
||||||
- `host` - hostname
|
- `hostname` - hostname
|
||||||
- `systemd_unit` - typically `auditd.service` for audit logs
|
- `systemd_unit` - typically `auditd.service` for audit logs
|
||||||
- `job` - typically `systemd-journal`
|
- `job` - typically `systemd-journal`
|
||||||
|
|
||||||
@@ -36,7 +36,7 @@ Audit log entries contain structured data:
|
|||||||
|
|
||||||
Find SSH logins and session activity:
|
Find SSH logins and session activity:
|
||||||
```logql
|
```logql
|
||||||
{host="<hostname>", systemd_unit="sshd.service"}
|
{hostname="<hostname>", systemd_unit="sshd.service"}
|
||||||
```
|
```
|
||||||
|
|
||||||
Look for:
|
Look for:
|
||||||
@@ -48,7 +48,7 @@ Look for:
|
|||||||
|
|
||||||
Query executed commands (filter out noise):
|
Query executed commands (filter out noise):
|
||||||
```logql
|
```logql
|
||||||
{host="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
|
{hostname="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
|
||||||
```
|
```
|
||||||
|
|
||||||
Further filtering:
|
Further filtering:
|
||||||
@@ -60,28 +60,28 @@ Further filtering:
|
|||||||
|
|
||||||
Check for privilege escalation:
|
Check for privilege escalation:
|
||||||
```logql
|
```logql
|
||||||
{host="<hostname>"} |= "sudo" |= "COMMAND"
|
{hostname="<hostname>"} |= "sudo" |= "COMMAND"
|
||||||
```
|
```
|
||||||
|
|
||||||
Or via audit:
|
Or via audit:
|
||||||
```logql
|
```logql
|
||||||
{host="<hostname>"} |= "USER_CMD"
|
{hostname="<hostname>"} |= "USER_CMD"
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4. Service Manipulation
|
### 4. Service Manipulation
|
||||||
|
|
||||||
Check if services were manually stopped/started:
|
Check if services were manually stopped/started:
|
||||||
```logql
|
```logql
|
||||||
{host="<hostname>"} |= "EXECVE" |= "systemctl"
|
{hostname="<hostname>"} |= "EXECVE" |= "systemctl"
|
||||||
```
|
```
|
||||||
|
|
||||||
### 5. File Operations
|
### 5. File Operations
|
||||||
|
|
||||||
Look for file modifications (if auditd rules are configured):
|
Look for file modifications (if auditd rules are configured):
|
||||||
```logql
|
```logql
|
||||||
{host="<hostname>"} |= "EXECVE" |= "vim"
|
{hostname="<hostname>"} |= "EXECVE" |= "vim"
|
||||||
{host="<hostname>"} |= "EXECVE" |= "nano"
|
{hostname="<hostname>"} |= "EXECVE" |= "nano"
|
||||||
{host="<hostname>"} |= "EXECVE" |= "rm"
|
{hostname="<hostname>"} |= "EXECVE" |= "rm"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Query Guidelines
|
## Query Guidelines
|
||||||
@@ -99,7 +99,7 @@ Look for file modifications (if auditd rules are configured):
|
|||||||
**Time-bounded queries:**
|
**Time-bounded queries:**
|
||||||
When investigating around a specific event:
|
When investigating around a specific event:
|
||||||
```logql
|
```logql
|
||||||
{host="<hostname>"} |= "EXECVE" != "systemd"
|
{hostname="<hostname>"} |= "EXECVE" != "systemd"
|
||||||
```
|
```
|
||||||
With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"`
|
With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"`
|
||||||
|
|
||||||
|
|||||||
@@ -41,13 +41,13 @@ Search for relevant log entries using `query_logs`. Focus on service-specific lo
|
|||||||
**Query strategies (start narrow, expand if needed):**
|
**Query strategies (start narrow, expand if needed):**
|
||||||
- Start with `limit: 20-30`, increase only if needed
|
- Start with `limit: 20-30`, increase only if needed
|
||||||
- Use tight time windows: `start: "15m"` or `start: "30m"` initially
|
- Use tight time windows: `start: "15m"` or `start: "30m"` initially
|
||||||
- Filter to specific services: `{host="<hostname>", systemd_unit="<service>.service"}`
|
- Filter to specific services: `{hostname="<hostname>", systemd_unit="<service>.service"}`
|
||||||
- Search for errors: `{host="<hostname>"} |= "error"` or `|= "failed"`
|
- Search for errors: `{hostname="<hostname>"} |= "error"` or `|= "failed"`
|
||||||
|
|
||||||
**Common patterns:**
|
**Common patterns:**
|
||||||
- Service logs: `{host="<hostname>", systemd_unit="<service>.service"}`
|
- Service logs: `{hostname="<hostname>", systemd_unit="<service>.service"}`
|
||||||
- All errors on host: `{host="<hostname>"} |= "error"`
|
- All errors on host: `{hostname="<hostname>"} |= "error"`
|
||||||
- Journal for a unit: `{host="<hostname>", systemd_unit="nginx.service"} |= "failed"`
|
- Journal for a unit: `{hostname="<hostname>", systemd_unit="nginx.service"} |= "failed"`
|
||||||
|
|
||||||
**Avoid:**
|
**Avoid:**
|
||||||
- Using `start: "1h"` with no filters on busy hosts
|
- Using `start: "1h"` with no filters on busy hosts
|
||||||
@@ -130,7 +130,7 @@ get_commit_info(<hash>) # Get full details of a specific change
|
|||||||
```
|
```
|
||||||
|
|
||||||
**Example workflow for a service-related alert:**
|
**Example workflow for a service-related alert:**
|
||||||
1. Query `nixos_flake_info{hostname="monitoring01"}` → `current_rev: 8959829`
|
1. Query `nixos_flake_info{hostname="monitoring02"}` → `current_rev: 8959829`
|
||||||
2. `resolve_ref("master")` → `4633421`
|
2. `resolve_ref("master")` → `4633421`
|
||||||
3. `is_ancestor("8959829", "4633421")` → Yes, host is behind
|
3. `is_ancestor("8959829", "4633421")` → Yes, host is behind
|
||||||
4. `commits_between("8959829", "4633421")` → 7 commits missing
|
4. `commits_between("8959829", "4633421")` → 7 commits missing
|
||||||
|
|||||||
@@ -30,11 +30,13 @@ Use the `lab-monitoring` MCP server tools:
|
|||||||
### Label Reference
|
### Label Reference
|
||||||
|
|
||||||
Available labels for log queries:
|
Available labels for log queries:
|
||||||
- `host` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`)
|
- `hostname` - Hostname (e.g., `ns1`, `monitoring02`, `ha1`) - matches the Prometheus `hostname` label
|
||||||
- `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`)
|
- `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`)
|
||||||
- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
|
- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
|
||||||
- `filename` - For `varlog` job, the log file path
|
- `filename` - For `varlog` job, the log file path
|
||||||
- `hostname` - Alternative to `host` for some streams
|
- `tier` - Deployment tier (`test` or `prod`)
|
||||||
|
- `role` - Host role (e.g., `dns`, `vault`, `monitoring`) - matches the Prometheus `role` label
|
||||||
|
- `level` - Log level mapped from journal PRIORITY (`critical`, `error`, `warning`, `notice`, `info`, `debug`) - journal scrape only
|
||||||
|
|
||||||
### Log Format
|
### Log Format
|
||||||
|
|
||||||
@@ -47,12 +49,12 @@ Journal logs are JSON-formatted. Key fields:
|
|||||||
|
|
||||||
**Logs from a specific service on a host:**
|
**Logs from a specific service on a host:**
|
||||||
```logql
|
```logql
|
||||||
{host="ns1", systemd_unit="nsd.service"}
|
{hostname="ns1", systemd_unit="nsd.service"}
|
||||||
```
|
```
|
||||||
|
|
||||||
**All logs from a host:**
|
**All logs from a host:**
|
||||||
```logql
|
```logql
|
||||||
{host="monitoring01"}
|
{hostname="monitoring02"}
|
||||||
```
|
```
|
||||||
|
|
||||||
**Logs from a service across all hosts:**
|
**Logs from a service across all hosts:**
|
||||||
@@ -62,17 +64,31 @@ Journal logs are JSON-formatted. Key fields:
|
|||||||
|
|
||||||
**Substring matching (case-sensitive):**
|
**Substring matching (case-sensitive):**
|
||||||
```logql
|
```logql
|
||||||
{host="ha1"} |= "error"
|
{hostname="ha1"} |= "error"
|
||||||
```
|
```
|
||||||
|
|
||||||
**Exclude pattern:**
|
**Exclude pattern:**
|
||||||
```logql
|
```logql
|
||||||
{host="ns1"} != "routine"
|
{hostname="ns1"} != "routine"
|
||||||
```
|
```
|
||||||
|
|
||||||
**Regex matching:**
|
**Regex matching:**
|
||||||
```logql
|
```logql
|
||||||
{systemd_unit="prometheus.service"} |~ "scrape.*failed"
|
{systemd_unit="victoriametrics.service"} |~ "scrape.*failed"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Filter by level (journal scrape only):**
|
||||||
|
```logql
|
||||||
|
{level="error"} # All errors across the fleet
|
||||||
|
{level=~"critical|error", tier="prod"} # Prod errors and criticals
|
||||||
|
{hostname="ns1", level="warning"} # Warnings from a specific host
|
||||||
|
```
|
||||||
|
|
||||||
|
**Filter by tier/role:**
|
||||||
|
```logql
|
||||||
|
{tier="prod"} |= "error" # All errors on prod hosts
|
||||||
|
{role="dns"} # All DNS server logs
|
||||||
|
{tier="test", job="systemd-journal"} # Journal logs from test hosts
|
||||||
```
|
```
|
||||||
|
|
||||||
**File-based logs (caddy access logs, etc):**
|
**File-based logs (caddy access logs, etc):**
|
||||||
@@ -93,7 +109,7 @@ Default lookback is 1 hour. Use `start` parameter for older logs:
|
|||||||
Useful systemd units for troubleshooting:
|
Useful systemd units for troubleshooting:
|
||||||
- `nixos-upgrade.service` - Daily auto-upgrade logs
|
- `nixos-upgrade.service` - Daily auto-upgrade logs
|
||||||
- `nsd.service` - DNS server (ns1/ns2)
|
- `nsd.service` - DNS server (ns1/ns2)
|
||||||
- `prometheus.service` - Metrics collection
|
- `victoriametrics.service` - Metrics collection
|
||||||
- `loki.service` - Log aggregation
|
- `loki.service` - Log aggregation
|
||||||
- `caddy.service` - Reverse proxy
|
- `caddy.service` - Reverse proxy
|
||||||
- `home-assistant.service` - Home automation
|
- `home-assistant.service` - Home automation
|
||||||
@@ -106,7 +122,7 @@ Useful systemd units for troubleshooting:
|
|||||||
|
|
||||||
VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:
|
VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:
|
||||||
|
|
||||||
- `host` - Target hostname
|
- `hostname` - Target hostname
|
||||||
- `branch` - Git branch being deployed
|
- `branch` - Git branch being deployed
|
||||||
- `stage` - Bootstrap stage (see table below)
|
- `stage` - Bootstrap stage (see table below)
|
||||||
|
|
||||||
@@ -127,7 +143,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl
|
|||||||
|
|
||||||
```logql
|
```logql
|
||||||
{job="bootstrap"} # All bootstrap logs
|
{job="bootstrap"} # All bootstrap logs
|
||||||
{job="bootstrap", host="myhost"} # Specific host
|
{job="bootstrap", hostname="myhost"} # Specific host
|
||||||
{job="bootstrap", stage="failed"} # All failures
|
{job="bootstrap", stage="failed"} # All failures
|
||||||
{job="bootstrap", stage=~"building|success"} # Track build progress
|
{job="bootstrap", stage=~"building|success"} # Track build progress
|
||||||
```
|
```
|
||||||
@@ -136,7 +152,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl
|
|||||||
|
|
||||||
Parse JSON and filter on fields:
|
Parse JSON and filter on fields:
|
||||||
```logql
|
```logql
|
||||||
{systemd_unit="prometheus.service"} | json | PRIORITY="3"
|
{systemd_unit="victoriametrics.service"} | json | PRIORITY="3"
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -226,12 +242,11 @@ All available Prometheus job names:
|
|||||||
- `unbound` - DNS resolver metrics (ns1, ns2)
|
- `unbound` - DNS resolver metrics (ns1, ns2)
|
||||||
- `wireguard` - VPN tunnel metrics (http-proxy)
|
- `wireguard` - VPN tunnel metrics (http-proxy)
|
||||||
|
|
||||||
**Monitoring stack (localhost on monitoring01):**
|
**Monitoring stack (localhost on monitoring02):**
|
||||||
- `prometheus` - Prometheus self-metrics
|
- `victoriametrics` - VictoriaMetrics self-metrics
|
||||||
- `loki` - Loki self-metrics
|
- `loki` - Loki self-metrics
|
||||||
- `grafana` - Grafana self-metrics
|
- `grafana` - Grafana self-metrics
|
||||||
- `alertmanager` - Alertmanager metrics
|
- `alertmanager` - Alertmanager metrics
|
||||||
- `pushgateway` - Push-based metrics gateway
|
|
||||||
|
|
||||||
**External/infrastructure:**
|
**External/infrastructure:**
|
||||||
- `pve-exporter` - Proxmox hypervisor metrics
|
- `pve-exporter` - Proxmox hypervisor metrics
|
||||||
@@ -246,7 +261,7 @@ All scrape targets have these labels:
|
|||||||
**Standard labels:**
|
**Standard labels:**
|
||||||
- `instance` - Full target address (`<hostname>.home.2rjus.net:<port>`)
|
- `instance` - Full target address (`<hostname>.home.2rjus.net:<port>`)
|
||||||
- `job` - Job name (e.g., `node-exporter`, `unbound`, `nixos-exporter`)
|
- `job` - Job name (e.g., `node-exporter`, `unbound`, `nixos-exporter`)
|
||||||
- `hostname` - Short hostname (e.g., `ns1`, `monitoring01`) - use this for host filtering
|
- `hostname` - Short hostname (e.g., `ns1`, `monitoring02`) - use this for host filtering
|
||||||
|
|
||||||
**Host metadata labels** (when configured in `homelab.host`):
|
**Host metadata labels** (when configured in `homelab.host`):
|
||||||
- `role` - Host role (e.g., `dns`, `build-host`, `vault`)
|
- `role` - Host role (e.g., `dns`, `build-host`, `vault`)
|
||||||
@@ -259,7 +274,7 @@ Use the `hostname` label for easy host filtering across all jobs:
|
|||||||
|
|
||||||
```promql
|
```promql
|
||||||
{hostname="ns1"} # All metrics from ns1
|
{hostname="ns1"} # All metrics from ns1
|
||||||
node_load1{hostname="monitoring01"} # Specific metric by hostname
|
node_load1{hostname="monitoring02"} # Specific metric by hostname
|
||||||
up{hostname="ha1"} # Check if ha1 is up
|
up{hostname="ha1"} # Check if ha1 is up
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -267,10 +282,10 @@ This is simpler than wildcarding the `instance` label:
|
|||||||
|
|
||||||
```promql
|
```promql
|
||||||
# Old way (still works but verbose)
|
# Old way (still works but verbose)
|
||||||
up{instance=~"monitoring01.*"}
|
up{instance=~"monitoring02.*"}
|
||||||
|
|
||||||
# New way (preferred)
|
# New way (preferred)
|
||||||
up{hostname="monitoring01"}
|
up{hostname="monitoring02"}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Filtering by Role/Tier
|
### Filtering by Role/Tier
|
||||||
@@ -308,8 +323,8 @@ Current host labels:
|
|||||||
|
|
||||||
1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures
|
1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures
|
||||||
2. Use `list_targets` to see target health details
|
2. Use `list_targets` to see target health details
|
||||||
3. Query service logs: `{host="<host>", systemd_unit="<service>.service"}`
|
3. Query service logs: `{hostname="<host>", systemd_unit="<service>.service"}`
|
||||||
4. Search for errors: `{host="<host>"} |= "error"`
|
4. Search for errors: `{hostname="<host>"} |= "error"`
|
||||||
5. Check `list_alerts` for related alerts
|
5. Check `list_alerts` for related alerts
|
||||||
6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers
|
6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers
|
||||||
|
|
||||||
@@ -324,17 +339,17 @@ Current host labels:
|
|||||||
|
|
||||||
When provisioning new VMs, track bootstrap progress:
|
When provisioning new VMs, track bootstrap progress:
|
||||||
|
|
||||||
1. Watch bootstrap logs: `{job="bootstrap", host="<hostname>"}`
|
1. Watch bootstrap logs: `{job="bootstrap", hostname="<hostname>"}`
|
||||||
2. Check for failures: `{job="bootstrap", host="<hostname>", stage="failed"}`
|
2. Check for failures: `{job="bootstrap", hostname="<hostname>", stage="failed"}`
|
||||||
3. After success, verify host appears in metrics: `up{hostname="<hostname>"}`
|
3. After success, verify host appears in metrics: `up{hostname="<hostname>"}`
|
||||||
4. Check logs are flowing: `{host="<hostname>"}`
|
4. Check logs are flowing: `{hostname="<hostname>"}`
|
||||||
|
|
||||||
See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline.
|
See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline.
|
||||||
|
|
||||||
### Debug SSH/Access Issues
|
### Debug SSH/Access Issues
|
||||||
|
|
||||||
```logql
|
```logql
|
||||||
{host="<host>", systemd_unit="sshd.service"}
|
{hostname="<host>", systemd_unit="sshd.service"}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Check Recent Upgrades
|
### Check Recent Upgrades
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ Additional context, caveats, or references.
|
|||||||
- **Reference existing patterns**: Mention how this fits with existing infrastructure
|
- **Reference existing patterns**: Mention how this fits with existing infrastructure
|
||||||
- **Tables for comparisons**: Use markdown tables when comparing options
|
- **Tables for comparisons**: Use markdown tables when comparing options
|
||||||
- **Practical focus**: Emphasize what needs to happen, not theory
|
- **Practical focus**: Emphasize what needs to happen, not theory
|
||||||
|
- **Mermaid diagrams**: Use mermaid code blocks for architecture diagrams, flow charts, or other graphs when relevant to the plan. Keep node labels short and use `<br/>` for line breaks
|
||||||
|
|
||||||
## Examples of Good Plans
|
## Examples of Good Plans
|
||||||
|
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -2,6 +2,9 @@
|
|||||||
result
|
result
|
||||||
result-*
|
result-*
|
||||||
|
|
||||||
|
# MCP config (contains secrets)
|
||||||
|
.mcp.json
|
||||||
|
|
||||||
# Terraform/OpenTofu
|
# Terraform/OpenTofu
|
||||||
terraform/.terraform/
|
terraform/.terraform/
|
||||||
terraform/.terraform.lock.hcl
|
terraform/.terraform.lock.hcl
|
||||||
|
|||||||
@@ -20,7 +20,9 @@
|
|||||||
"env": {
|
"env": {
|
||||||
"PROMETHEUS_URL": "https://prometheus.home.2rjus.net",
|
"PROMETHEUS_URL": "https://prometheus.home.2rjus.net",
|
||||||
"ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net",
|
"ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net",
|
||||||
"LOKI_URL": "http://monitoring01.home.2rjus.net:3100"
|
"LOKI_URL": "https://loki.home.2rjus.net",
|
||||||
|
"LOKI_USERNAME": "promtail",
|
||||||
|
"LOKI_PASSWORD": "<password from: bao kv get -field=password secret/shared/loki/push-auth>"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"homelab-deploy": {
|
"homelab-deploy": {
|
||||||
@@ -31,7 +33,8 @@
|
|||||||
"--",
|
"--",
|
||||||
"mcp",
|
"mcp",
|
||||||
"--nats-url", "nats://nats1.home.2rjus.net:4222",
|
"--nats-url", "nats://nats1.home.2rjus.net:4222",
|
||||||
"--nkey-file", "/home/torjus/.config/homelab-deploy/test-deployer.nkey"
|
"--nkey-file", "/home/torjus/.config/homelab-deploy/test-deployer.nkey",
|
||||||
|
"--enable-builds"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"git-explorer": {
|
"git-explorer": {
|
||||||
@@ -43,4 +46,3 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
83
CLAUDE.md
83
CLAUDE.md
@@ -39,6 +39,30 @@ Do not automatically deploy changes. Deployments are usually done by updating th
|
|||||||
|
|
||||||
Do not run SSH commands directly. If a command needs to be run on a remote host, provide the command to the user and ask them to run it manually.
|
Do not run SSH commands directly. If a command needs to be run on a remote host, provide the command to the user and ask them to run it manually.
|
||||||
|
|
||||||
|
### Sharing Command Output via Loki
|
||||||
|
|
||||||
|
All hosts have the `pipe-to-loki` script for sending command output or terminal sessions to Loki, allowing users to share output with Claude without copy-pasting.
|
||||||
|
|
||||||
|
**Pipe mode** - send command output:
|
||||||
|
```bash
|
||||||
|
command | pipe-to-loki # Auto-generated ID
|
||||||
|
command | pipe-to-loki --id my-test # Custom ID
|
||||||
|
```
|
||||||
|
|
||||||
|
**Session mode** - record interactive terminal session:
|
||||||
|
```bash
|
||||||
|
pipe-to-loki --record # Start recording, exit to send
|
||||||
|
pipe-to-loki --record --id my-session # With custom ID
|
||||||
|
```
|
||||||
|
|
||||||
|
The script prints the session ID which the user can share. Query results with:
|
||||||
|
```logql
|
||||||
|
{job="pipe-to-loki"} # All entries
|
||||||
|
{job="pipe-to-loki", id="my-test"} # Specific ID
|
||||||
|
{job="pipe-to-loki", hostname="testvm01"} # From specific host
|
||||||
|
{job="pipe-to-loki", type="session"} # Only sessions
|
||||||
|
```
|
||||||
|
|
||||||
### Testing Feature Branches on Hosts
|
### Testing Feature Branches on Hosts
|
||||||
|
|
||||||
All hosts have the `nixos-rebuild-test` helper script for testing feature branches before merging:
|
All hosts have the `nixos-rebuild-test` helper script for testing feature branches before merging:
|
||||||
@@ -90,6 +114,12 @@ nix develop -c tofu -chdir=terraform/vault apply
|
|||||||
cd terraform && tofu plan
|
cd terraform && tofu plan
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Ansible
|
||||||
|
|
||||||
|
Ansible configuration and playbooks are in `/ansible/`. See [ansible/README.md](ansible/README.md) for inventory groups, available playbooks, and usage examples.
|
||||||
|
|
||||||
|
The devshell sets `ANSIBLE_CONFIG` automatically, so no `-i` flag is needed.
|
||||||
|
|
||||||
### Secrets Management
|
### Secrets Management
|
||||||
|
|
||||||
Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the
|
Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the
|
||||||
@@ -102,6 +132,8 @@ Terraform manages the secrets and AppRole policies in `terraform/vault/`.
|
|||||||
|
|
||||||
**Important:** Never amend commits to `master` unless the user explicitly asks for it. Amending rewrites history and causes issues for deployed configurations.
|
**Important:** Never amend commits to `master` unless the user explicitly asks for it. Amending rewrites history and causes issues for deployed configurations.
|
||||||
|
|
||||||
|
**Important:** Never force push to `master`. If a commit on master has an error, fix it with a new commit rather than rewriting history.
|
||||||
|
|
||||||
**Important:** Do not use `gh pr create` to create pull requests. The git server does not support GitHub CLI for PR creation. Instead, push the branch and let the user create the PR manually via the web interface.
|
**Important:** Do not use `gh pr create` to create pull requests. The git server does not support GitHub CLI for PR creation. Instead, push the branch and let the user create the PR manually via the web interface.
|
||||||
|
|
||||||
When starting a new plan or task, the first step should typically be to create and checkout a new branch with an appropriate name (e.g., `git checkout -b dns-automation` or `git checkout -b fix-nginx-config`).
|
When starting a new plan or task, the first step should typically be to create and checkout a new branch with an appropriate name (e.g., `git checkout -b dns-automation` or `git checkout -b fix-nginx-config`).
|
||||||
@@ -215,7 +247,7 @@ nix develop -c homelab-deploy -- deploy \
|
|||||||
deploy.prod.<hostname>
|
deploy.prod.<hostname>
|
||||||
```
|
```
|
||||||
|
|
||||||
Subject format: `deploy.<tier>.<hostname>` (e.g., `deploy.prod.monitoring01`, `deploy.test.testvm01`)
|
Subject format: `deploy.<tier>.<hostname>` (e.g., `deploy.prod.monitoring02`, `deploy.test.testvm01`)
|
||||||
|
|
||||||
**Verifying Deployments:**
|
**Verifying Deployments:**
|
||||||
|
|
||||||
@@ -255,7 +287,10 @@ The `current_rev` label contains the git commit hash of the deployed flake confi
|
|||||||
- `/docs/` - Documentation and plans
|
- `/docs/` - Documentation and plans
|
||||||
- `plans/` - Future plans and proposals
|
- `plans/` - Future plans and proposals
|
||||||
- `plans/completed/` - Completed plans (moved here when done)
|
- `plans/completed/` - Completed plans (moved here when done)
|
||||||
- `/playbooks/` - Ansible playbooks for fleet management
|
- `/ansible/` - Ansible configuration and playbooks
|
||||||
|
- `ansible.cfg` - Ansible configuration (inventory path, defaults)
|
||||||
|
- `inventory/` - Dynamic and static inventory sources
|
||||||
|
- `playbooks/` - Ansible playbooks for fleet management
|
||||||
|
|
||||||
### Configuration Inheritance
|
### Configuration Inheritance
|
||||||
|
|
||||||
@@ -274,29 +309,16 @@ All hosts automatically get:
|
|||||||
- OpenBao (Vault) secrets management via AppRole
|
- OpenBao (Vault) secrets management via AppRole
|
||||||
- Internal ACME CA integration (OpenBao PKI at vault.home.2rjus.net)
|
- Internal ACME CA integration (OpenBao PKI at vault.home.2rjus.net)
|
||||||
- Daily auto-upgrades with auto-reboot
|
- Daily auto-upgrades with auto-reboot
|
||||||
- Prometheus node-exporter + Promtail (logs to monitoring01)
|
- Prometheus node-exporter + Promtail (logs to monitoring02)
|
||||||
- Monitoring scrape target auto-registration via `homelab.monitoring` options
|
- Monitoring scrape target auto-registration via `homelab.monitoring` options
|
||||||
- Custom root CA trust
|
- Custom root CA trust
|
||||||
- DNS zone auto-registration via `homelab.dns` options
|
- DNS zone auto-registration via `homelab.dns` options
|
||||||
|
|
||||||
### Active Hosts
|
### Hosts
|
||||||
|
|
||||||
Production servers:
|
Host configurations are in `/hosts/<hostname>/`. See `flake.nix` for the complete list of `nixosConfigurations`.
|
||||||
- `ns1`, `ns2` - Primary/secondary DNS servers (10.69.13.5/6)
|
|
||||||
- `vault01` - OpenBao (Vault) secrets server + PKI CA
|
|
||||||
- `ha1` - Home Assistant + Zigbee2MQTT + Mosquitto
|
|
||||||
- `http-proxy` - Reverse proxy
|
|
||||||
- `monitoring01` - Full observability stack (Prometheus, Grafana, Loki, Tempo, Pyroscope)
|
|
||||||
- `jelly01` - Jellyfin media server
|
|
||||||
- `nix-cache01` - Binary cache server + GitHub Actions runner
|
|
||||||
- `pgdb1` - PostgreSQL database
|
|
||||||
- `nats1` - NATS messaging server
|
|
||||||
|
|
||||||
Test/staging hosts:
|
Use `nix flake show` or `nix develop -c ansible-inventory --graph` to list all hosts.
|
||||||
- `testvm01`, `testvm02`, `testvm03` - Test-tier VMs for branch testing and deployment validation
|
|
||||||
|
|
||||||
Template hosts:
|
|
||||||
- `template1`, `template2` - Base templates for cloning new hosts
|
|
||||||
|
|
||||||
### Flake Inputs
|
### Flake Inputs
|
||||||
|
|
||||||
@@ -313,7 +335,7 @@ Template hosts:
|
|||||||
- Infrastructure subnet: `10.69.13.x`
|
- Infrastructure subnet: `10.69.13.x`
|
||||||
- DNS: ns1/ns2 provide authoritative DNS with primary-secondary setup
|
- DNS: ns1/ns2 provide authoritative DNS with primary-secondary setup
|
||||||
- Internal CA for ACME certificates (no Let's Encrypt)
|
- Internal CA for ACME certificates (no Let's Encrypt)
|
||||||
- Centralized monitoring at monitoring01
|
- Centralized monitoring at monitoring02
|
||||||
- Static networking via systemd-networkd
|
- Static networking via systemd-networkd
|
||||||
|
|
||||||
### Secrets Management
|
### Secrets Management
|
||||||
@@ -327,7 +349,7 @@ Most hosts use OpenBao (Vault) for secrets:
|
|||||||
- `extractKey` option extracts a single key from vault JSON as a plain file
|
- `extractKey` option extracts a single key from vault JSON as a plain file
|
||||||
- Secrets fetched at boot by `vault-secret-<name>.service` systemd units
|
- Secrets fetched at boot by `vault-secret-<name>.service` systemd units
|
||||||
- Fallback to cached secrets in `/var/lib/vault/cache/` when Vault is unreachable
|
- Fallback to cached secrets in `/var/lib/vault/cache/` when Vault is unreachable
|
||||||
- Provision AppRole credentials: `nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=<host>`
|
- Provision AppRole credentials: `nix develop -c ansible-playbook ansible/playbooks/provision-approle.yml -l <hostname>`
|
||||||
|
|
||||||
### Auto-Upgrade System
|
### Auto-Upgrade System
|
||||||
|
|
||||||
@@ -351,7 +373,7 @@ Template VMs are built from `hosts/template2` and deployed to Proxmox using Ansi
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Build NixOS image and deploy to Proxmox as template
|
# Build NixOS image and deploy to Proxmox as template
|
||||||
nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml
|
nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
This playbook:
|
This playbook:
|
||||||
@@ -426,7 +448,7 @@ This means:
|
|||||||
- `tofu plan` won't show spurious changes for Proxmox-managed defaults
|
- `tofu plan` won't show spurious changes for Proxmox-managed defaults
|
||||||
|
|
||||||
**When rebuilding the template:**
|
**When rebuilding the template:**
|
||||||
1. Run `nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml`
|
1. Run `nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml`
|
||||||
2. Update `default_template_name` in `terraform/variables.tf` if the name changed
|
2. Update `default_template_name` in `terraform/variables.tf` if the name changed
|
||||||
3. Run `tofu plan` - should show no VM recreations (only template name in state)
|
3. Run `tofu plan` - should show no VM recreations (only template name in state)
|
||||||
4. Run `tofu apply` - updates state without touching existing VMs
|
4. Run `tofu apply` - updates state without touching existing VMs
|
||||||
@@ -458,23 +480,21 @@ See [docs/host-creation.md](docs/host-creation.md) for the complete host creatio
|
|||||||
|
|
||||||
### Monitoring Stack
|
### Monitoring Stack
|
||||||
|
|
||||||
All hosts ship metrics and logs to `monitoring01`:
|
All hosts ship metrics and logs to `monitoring02`:
|
||||||
- **Metrics**: Prometheus scrapes node-exporter from all hosts
|
- **Metrics**: VictoriaMetrics scrapes node-exporter from all hosts
|
||||||
- **Logs**: Promtail ships logs to Loki on monitoring01
|
- **Logs**: Promtail ships logs to Loki on monitoring02
|
||||||
- **Access**: Grafana at monitoring01 for visualization
|
- **Access**: Grafana at monitoring02 for visualization
|
||||||
- **Tracing**: Tempo for distributed tracing
|
|
||||||
- **Profiling**: Pyroscope for continuous profiling
|
|
||||||
|
|
||||||
**Scrape Target Auto-Generation:**
|
**Scrape Target Auto-Generation:**
|
||||||
|
|
||||||
Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
|
VictoriaMetrics scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
|
||||||
|
|
||||||
- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
|
- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
|
||||||
- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
|
- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
|
||||||
- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
|
- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
|
||||||
- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
|
- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
|
||||||
|
|
||||||
Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The Prometheus config on monitoring01 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options.
|
Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The VictoriaMetrics config on monitoring02 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options.
|
||||||
|
|
||||||
To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
|
To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
|
||||||
|
|
||||||
@@ -509,6 +529,7 @@ The `modules/homelab/` directory defines custom options used across hosts for au
|
|||||||
- `priority` - Alerting priority: `high` or `low`. Controls alerting thresholds for the host.
|
- `priority` - Alerting priority: `high` or `low`. Controls alerting thresholds for the host.
|
||||||
- `role` - Primary role designation (e.g., `dns`, `database`, `bastion`, `vault`)
|
- `role` - Primary role designation (e.g., `dns`, `database`, `bastion`, `vault`)
|
||||||
- `labels` - Free-form key-value metadata for host categorization
|
- `labels` - Free-form key-value metadata for host categorization
|
||||||
|
- `ansible = "false"` - Exclude host from Ansible dynamic inventory
|
||||||
|
|
||||||
**DNS options (`homelab.dns.*`):**
|
**DNS options (`homelab.dns.*`):**
|
||||||
- `enable` (default: `true`) - Include host in DNS zone generation
|
- `enable` (default: `true`) - Include host in DNS zone generation
|
||||||
|
|||||||
@@ -10,9 +10,9 @@ NixOS Flake-based configuration repository for a homelab infrastructure. All hos
|
|||||||
| `ca` | Internal Certificate Authority |
|
| `ca` | Internal Certificate Authority |
|
||||||
| `ha1` | Home Assistant + Zigbee2MQTT + Mosquitto |
|
| `ha1` | Home Assistant + Zigbee2MQTT + Mosquitto |
|
||||||
| `http-proxy` | Reverse proxy |
|
| `http-proxy` | Reverse proxy |
|
||||||
| `monitoring01` | Prometheus, Grafana, Loki, Tempo, Pyroscope |
|
| `monitoring02` | VictoriaMetrics, Grafana, Loki, Alertmanager |
|
||||||
| `jelly01` | Jellyfin media server |
|
| `jelly01` | Jellyfin media server |
|
||||||
| `nix-cache01` | Nix binary cache |
|
| `nix-cache02` | Nix binary cache + NATS-based build service |
|
||||||
| `nats1` | NATS messaging |
|
| `nats1` | NATS messaging |
|
||||||
| `vault01` | OpenBao (Vault) secrets management |
|
| `vault01` | OpenBao (Vault) secrets management |
|
||||||
| `template1`, `template2` | VM templates for cloning new hosts |
|
| `template1`, `template2` | VM templates for cloning new hosts |
|
||||||
@@ -121,4 +121,4 @@ No manual intervention is required after `tofu apply`.
|
|||||||
- Infrastructure subnet: `10.69.13.0/24`
|
- Infrastructure subnet: `10.69.13.0/24`
|
||||||
- DNS: ns1/ns2 authoritative with primary-secondary AXFR
|
- DNS: ns1/ns2 authoritative with primary-secondary AXFR
|
||||||
- Internal CA for TLS certificates (migrating from step-ca to OpenBao PKI)
|
- Internal CA for TLS certificates (migrating from step-ca to OpenBao PKI)
|
||||||
- Centralized monitoring at monitoring01
|
- Centralized monitoring at monitoring02
|
||||||
|
|||||||
120
ansible/README.md
Normal file
120
ansible/README.md
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
# Ansible Configuration
|
||||||
|
|
||||||
|
This directory contains Ansible configuration for fleet management tasks.
|
||||||
|
|
||||||
|
## Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
ansible/
|
||||||
|
├── ansible.cfg # Ansible configuration
|
||||||
|
├── inventory/
|
||||||
|
│ ├── dynamic_flake.py # Dynamic inventory from NixOS flake
|
||||||
|
│ ├── static.yml # Non-flake hosts (Proxmox, etc.)
|
||||||
|
│ └── group_vars/
|
||||||
|
│ └── all.yml # Common variables
|
||||||
|
└── playbooks/
|
||||||
|
├── build-and-deploy-template.yml
|
||||||
|
├── provision-approle.yml
|
||||||
|
├── restart-service.yml
|
||||||
|
└── run-upgrade.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
The devshell automatically configures `ANSIBLE_CONFIG`, so commands work without extra flags:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List inventory groups
|
||||||
|
nix develop -c ansible-inventory --graph
|
||||||
|
|
||||||
|
# List hosts in a specific group
|
||||||
|
nix develop -c ansible-inventory --list | jq '.role_dns'
|
||||||
|
|
||||||
|
# Run a playbook
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/run-upgrade.yml -l tier_test
|
||||||
|
```
|
||||||
|
|
||||||
|
## Inventory
|
||||||
|
|
||||||
|
The inventory combines dynamic and static sources automatically.
|
||||||
|
|
||||||
|
### Dynamic Inventory (from flake)
|
||||||
|
|
||||||
|
The `dynamic_flake.py` script extracts hosts from the NixOS flake using `homelab.host.*` options:
|
||||||
|
|
||||||
|
**Groups generated:**
|
||||||
|
- `flake_hosts` - All NixOS hosts from the flake
|
||||||
|
- `tier_test`, `tier_prod` - By `homelab.host.tier`
|
||||||
|
- `role_dns`, `role_vault`, `role_monitoring`, etc. - By `homelab.host.role`
|
||||||
|
|
||||||
|
**Host variables set:**
|
||||||
|
- `tier` - Deployment tier (test/prod)
|
||||||
|
- `role` - Host role
|
||||||
|
- `short_hostname` - Hostname without domain
|
||||||
|
|
||||||
|
### Static Inventory
|
||||||
|
|
||||||
|
Non-flake hosts are defined in `inventory/static.yml`:
|
||||||
|
|
||||||
|
- `proxmox` - Proxmox hypervisors
|
||||||
|
|
||||||
|
## Playbooks
|
||||||
|
|
||||||
|
| Playbook | Description | Example |
|
||||||
|
|----------|-------------|---------|
|
||||||
|
| `run-upgrade.yml` | Trigger nixos-upgrade on hosts | `-l tier_prod` |
|
||||||
|
| `restart-service.yml` | Restart a systemd service | `-l role_dns -e service=unbound` |
|
||||||
|
| `reboot.yml` | Rolling reboot (one host at a time) | `-l tier_test` |
|
||||||
|
| `provision-approle.yml` | Deploy Vault credentials (single host only) | `-l testvm01` |
|
||||||
|
| `build-and-deploy-template.yml` | Build and deploy Proxmox template | (no limit needed) |
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restart unbound on all DNS servers
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/restart-service.yml \
|
||||||
|
-l role_dns -e service=unbound
|
||||||
|
|
||||||
|
# Trigger upgrade on all test hosts
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/run-upgrade.yml -l tier_test
|
||||||
|
|
||||||
|
# Provision Vault credentials for a specific host
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/provision-approle.yml -l testvm01
|
||||||
|
|
||||||
|
# Build and deploy Proxmox template
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml
|
||||||
|
|
||||||
|
# Rolling reboot of test hosts (one at a time, waits for each to come back)
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/reboot.yml -l tier_test
|
||||||
|
```
|
||||||
|
|
||||||
|
## Excluding Flake Hosts
|
||||||
|
|
||||||
|
To exclude a flake host from the dynamic inventory, add the `ansible = "false"` label in the host's configuration:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host.labels.ansible = "false";
|
||||||
|
```
|
||||||
|
|
||||||
|
Hosts with `homelab.dns.enable = false` are also excluded automatically.
|
||||||
|
|
||||||
|
## Adding Non-Flake Hosts
|
||||||
|
|
||||||
|
Edit `inventory/static.yml` to add hosts not managed by the NixOS flake:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
all:
|
||||||
|
children:
|
||||||
|
my_group:
|
||||||
|
hosts:
|
||||||
|
host1.example.com:
|
||||||
|
ansible_user: admin
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Variables
|
||||||
|
|
||||||
|
Variables in `inventory/group_vars/all.yml` apply to all hosts:
|
||||||
|
|
||||||
|
- `ansible_user` - Default SSH user (root)
|
||||||
|
- `domain` - Domain name (home.2rjus.net)
|
||||||
|
- `vault_addr` - Vault server URL
|
||||||
17
ansible/ansible.cfg
Normal file
17
ansible/ansible.cfg
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
[defaults]
|
||||||
|
inventory = inventory/
|
||||||
|
remote_user = root
|
||||||
|
host_key_checking = False
|
||||||
|
|
||||||
|
# Reduce SSH connection overhead
|
||||||
|
forks = 10
|
||||||
|
pipelining = True
|
||||||
|
|
||||||
|
# Output formatting (YAML output via builtin default callback)
|
||||||
|
stdout_callback = default
|
||||||
|
callbacks_enabled = profile_tasks
|
||||||
|
result_format = yaml
|
||||||
|
|
||||||
|
[ssh_connection]
|
||||||
|
# Reuse SSH connections
|
||||||
|
ssh_args = -o ControlMaster=auto -o ControlPersist=60s
|
||||||
162
ansible/inventory/dynamic_flake.py
Executable file
162
ansible/inventory/dynamic_flake.py
Executable file
@@ -0,0 +1,162 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Dynamic Ansible inventory script that extracts host information from the NixOS flake.
|
||||||
|
|
||||||
|
Generates groups:
|
||||||
|
- flake_hosts: All hosts defined in the flake
|
||||||
|
- tier_test, tier_prod: Hosts by deployment tier
|
||||||
|
- role_<name>: Hosts by role (dns, vault, monitoring, etc.)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
./dynamic_flake.py --list # Return full inventory
|
||||||
|
./dynamic_flake.py --host X # Return host vars (not used, but required by Ansible)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_flake_dir() -> Path:
|
||||||
|
"""Find the flake root directory."""
|
||||||
|
script_dir = Path(__file__).resolve().parent
|
||||||
|
# ansible/inventory/dynamic_flake.py -> repo root
|
||||||
|
return script_dir.parent.parent
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_flake() -> dict:
|
||||||
|
"""Evaluate the flake and extract host metadata."""
|
||||||
|
flake_dir = get_flake_dir()
|
||||||
|
|
||||||
|
# Nix expression to extract relevant config from each host
|
||||||
|
nix_expr = """
|
||||||
|
configs: builtins.mapAttrs (name: cfg: {
|
||||||
|
hostname = cfg.config.networking.hostName;
|
||||||
|
domain = cfg.config.networking.domain or "home.2rjus.net";
|
||||||
|
tier = cfg.config.homelab.host.tier;
|
||||||
|
role = cfg.config.homelab.host.role;
|
||||||
|
labels = cfg.config.homelab.host.labels;
|
||||||
|
dns_enabled = cfg.config.homelab.dns.enable;
|
||||||
|
}) configs
|
||||||
|
"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[
|
||||||
|
"nix",
|
||||||
|
"eval",
|
||||||
|
"--json",
|
||||||
|
f"{flake_dir}#nixosConfigurations",
|
||||||
|
"--apply",
|
||||||
|
nix_expr,
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
check=True,
|
||||||
|
cwd=flake_dir,
|
||||||
|
)
|
||||||
|
return json.loads(result.stdout)
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f"Error evaluating flake: {e.stderr}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
print(f"Error parsing nix output: {e}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_group_name(name: str) -> str:
|
||||||
|
"""Sanitize a string for use as an Ansible group name.
|
||||||
|
|
||||||
|
Ansible group names should contain only alphanumeric characters and underscores.
|
||||||
|
"""
|
||||||
|
return name.replace("-", "_")
|
||||||
|
|
||||||
|
|
||||||
|
def build_inventory(hosts_data: dict) -> dict:
|
||||||
|
"""Build Ansible inventory structure from host data."""
|
||||||
|
inventory = {
|
||||||
|
"_meta": {"hostvars": {}},
|
||||||
|
"flake_hosts": {"hosts": []},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Track groups we need to create
|
||||||
|
tier_groups: dict[str, list[str]] = {}
|
||||||
|
role_groups: dict[str, list[str]] = {}
|
||||||
|
|
||||||
|
for _config_name, host_info in hosts_data.items():
|
||||||
|
hostname = host_info["hostname"]
|
||||||
|
domain = host_info["domain"]
|
||||||
|
tier = host_info["tier"]
|
||||||
|
role = host_info["role"]
|
||||||
|
labels = host_info["labels"]
|
||||||
|
dns_enabled = host_info["dns_enabled"]
|
||||||
|
|
||||||
|
# Skip hosts that have DNS disabled (like templates)
|
||||||
|
if not dns_enabled:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip hosts with ansible = "false" label
|
||||||
|
if labels.get("ansible") == "false":
|
||||||
|
continue
|
||||||
|
|
||||||
|
fqdn = f"{hostname}.{domain}"
|
||||||
|
|
||||||
|
# Use short hostname as inventory name, FQDN for connection
|
||||||
|
inventory_name = hostname
|
||||||
|
|
||||||
|
# Add to flake_hosts group
|
||||||
|
inventory["flake_hosts"]["hosts"].append(inventory_name)
|
||||||
|
|
||||||
|
# Add host variables
|
||||||
|
inventory["_meta"]["hostvars"][inventory_name] = {
|
||||||
|
"ansible_host": fqdn, # Connect using FQDN
|
||||||
|
"fqdn": fqdn,
|
||||||
|
"tier": tier,
|
||||||
|
"role": role,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Group by tier
|
||||||
|
tier_group = f"tier_{sanitize_group_name(tier)}"
|
||||||
|
if tier_group not in tier_groups:
|
||||||
|
tier_groups[tier_group] = []
|
||||||
|
tier_groups[tier_group].append(inventory_name)
|
||||||
|
|
||||||
|
# Group by role (if set)
|
||||||
|
if role:
|
||||||
|
role_group = f"role_{sanitize_group_name(role)}"
|
||||||
|
if role_group not in role_groups:
|
||||||
|
role_groups[role_group] = []
|
||||||
|
role_groups[role_group].append(inventory_name)
|
||||||
|
|
||||||
|
# Add tier groups to inventory
|
||||||
|
for group_name, hosts in tier_groups.items():
|
||||||
|
inventory[group_name] = {"hosts": hosts}
|
||||||
|
|
||||||
|
# Add role groups to inventory
|
||||||
|
for group_name, hosts in role_groups.items():
|
||||||
|
inventory[group_name] = {"hosts": hosts}
|
||||||
|
|
||||||
|
return inventory
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
if len(sys.argv) < 2:
|
||||||
|
print("Usage: dynamic_flake.py --list | --host <hostname>", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if sys.argv[1] == "--list":
|
||||||
|
hosts_data = evaluate_flake()
|
||||||
|
inventory = build_inventory(hosts_data)
|
||||||
|
print(json.dumps(inventory, indent=2))
|
||||||
|
elif sys.argv[1] == "--host":
|
||||||
|
# Ansible calls this to get vars for a specific host
|
||||||
|
# We provide all vars in _meta.hostvars, so just return empty
|
||||||
|
print(json.dumps({}))
|
||||||
|
else:
|
||||||
|
print(f"Unknown option: {sys.argv[1]}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
5
ansible/inventory/group_vars/all.yml
Normal file
5
ansible/inventory/group_vars/all.yml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Common variables for all hosts
|
||||||
|
|
||||||
|
ansible_user: root
|
||||||
|
domain: home.2rjus.net
|
||||||
|
vault_addr: https://vault01.home.2rjus.net:8200
|
||||||
13
ansible/inventory/static.yml
Normal file
13
ansible/inventory/static.yml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Static inventory for non-flake hosts
|
||||||
|
#
|
||||||
|
# Hosts defined here are merged with the dynamic flake inventory.
|
||||||
|
# Use this for infrastructure that isn't managed by NixOS.
|
||||||
|
#
|
||||||
|
# Use short hostnames as inventory names with ansible_host for FQDN.
|
||||||
|
|
||||||
|
all:
|
||||||
|
children:
|
||||||
|
proxmox:
|
||||||
|
hosts:
|
||||||
|
pve1:
|
||||||
|
ansible_host: pve1.home.2rjus.net
|
||||||
@@ -15,13 +15,13 @@
|
|||||||
- name: Build NixOS image
|
- name: Build NixOS image
|
||||||
ansible.builtin.command:
|
ansible.builtin.command:
|
||||||
cmd: "nixos-rebuild build-image --image-variant proxmox --flake .#template2"
|
cmd: "nixos-rebuild build-image --image-variant proxmox --flake .#template2"
|
||||||
chdir: "{{ playbook_dir }}/.."
|
chdir: "{{ playbook_dir }}/../.."
|
||||||
register: build_result
|
register: build_result
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
- name: Find built image file
|
- name: Find built image file
|
||||||
ansible.builtin.find:
|
ansible.builtin.find:
|
||||||
paths: "{{ playbook_dir}}/../result"
|
paths: "{{ playbook_dir}}/../../result"
|
||||||
patterns: "*.vma.zst"
|
patterns: "*.vma.zst"
|
||||||
recurse: true
|
recurse: true
|
||||||
register: image_files
|
register: image_files
|
||||||
@@ -105,7 +105,7 @@
|
|||||||
gather_facts: false
|
gather_facts: false
|
||||||
|
|
||||||
vars:
|
vars:
|
||||||
terraform_dir: "{{ playbook_dir }}/../terraform"
|
terraform_dir: "{{ playbook_dir }}/../../terraform"
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Get image filename from earlier play
|
- name: Get image filename from earlier play
|
||||||
@@ -1,7 +1,27 @@
|
|||||||
---
|
---
|
||||||
# Provision OpenBao AppRole credentials to an existing host
|
# Provision OpenBao AppRole credentials to a host
|
||||||
# Usage: nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=ha1
|
#
|
||||||
|
# Usage: ansible-playbook ansible/playbooks/provision-approle.yml -l <hostname>
|
||||||
# Requires: BAO_ADDR and BAO_TOKEN environment variables set
|
# Requires: BAO_ADDR and BAO_TOKEN environment variables set
|
||||||
|
#
|
||||||
|
# IMPORTANT: This playbook must target exactly one host to prevent
|
||||||
|
# accidentally regenerating credentials for multiple hosts.
|
||||||
|
|
||||||
|
- name: Validate single host target
|
||||||
|
hosts: all
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Fail if targeting multiple hosts
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: |
|
||||||
|
This playbook must target exactly one host.
|
||||||
|
Use: ansible-playbook provision-approle.yml -l <hostname>
|
||||||
|
|
||||||
|
Targeting multiple hosts would regenerate credentials for all of them,
|
||||||
|
potentially breaking existing services.
|
||||||
|
when: ansible_play_hosts | length != 1
|
||||||
|
run_once: true
|
||||||
|
|
||||||
- name: Fetch AppRole credentials from OpenBao
|
- name: Fetch AppRole credentials from OpenBao
|
||||||
hosts: localhost
|
hosts: localhost
|
||||||
@@ -9,18 +29,17 @@
|
|||||||
gather_facts: false
|
gather_facts: false
|
||||||
|
|
||||||
vars:
|
vars:
|
||||||
vault_addr: "{{ lookup('env', 'BAO_ADDR') | default('https://vault01.home.2rjus.net:8200', true) }}"
|
target_host: "{{ groups['all'] | first }}"
|
||||||
domain: "home.2rjus.net"
|
target_hostname: "{{ hostvars[target_host]['short_hostname'] | default(target_host.split('.')[0]) }}"
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Validate hostname is provided
|
- name: Display target host
|
||||||
ansible.builtin.fail:
|
ansible.builtin.debug:
|
||||||
msg: "hostname variable is required. Use: -e hostname=<name>"
|
msg: "Provisioning AppRole credentials for: {{ target_hostname }}"
|
||||||
when: hostname is not defined
|
|
||||||
|
|
||||||
- name: Get role-id for host
|
- name: Get role-id for host
|
||||||
ansible.builtin.command:
|
ansible.builtin.command:
|
||||||
cmd: "bao read -field=role_id auth/approle/role/{{ hostname }}/role-id"
|
cmd: "bao read -field=role_id auth/approle/role/{{ target_hostname }}/role-id"
|
||||||
environment:
|
environment:
|
||||||
BAO_ADDR: "{{ vault_addr }}"
|
BAO_ADDR: "{{ vault_addr }}"
|
||||||
BAO_SKIP_VERIFY: "1"
|
BAO_SKIP_VERIFY: "1"
|
||||||
@@ -29,25 +48,26 @@
|
|||||||
|
|
||||||
- name: Generate secret-id for host
|
- name: Generate secret-id for host
|
||||||
ansible.builtin.command:
|
ansible.builtin.command:
|
||||||
cmd: "bao write -field=secret_id -f auth/approle/role/{{ hostname }}/secret-id"
|
cmd: "bao write -field=secret_id -f auth/approle/role/{{ target_hostname }}/secret-id"
|
||||||
environment:
|
environment:
|
||||||
BAO_ADDR: "{{ vault_addr }}"
|
BAO_ADDR: "{{ vault_addr }}"
|
||||||
BAO_SKIP_VERIFY: "1"
|
BAO_SKIP_VERIFY: "1"
|
||||||
register: secret_id_result
|
register: secret_id_result
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
- name: Add target host to inventory
|
- name: Store credentials for next play
|
||||||
ansible.builtin.add_host:
|
ansible.builtin.set_fact:
|
||||||
name: "{{ hostname }}.{{ domain }}"
|
|
||||||
groups: vault_target
|
|
||||||
ansible_user: root
|
|
||||||
vault_role_id: "{{ role_id_result.stdout }}"
|
vault_role_id: "{{ role_id_result.stdout }}"
|
||||||
vault_secret_id: "{{ secret_id_result.stdout }}"
|
vault_secret_id: "{{ secret_id_result.stdout }}"
|
||||||
|
|
||||||
- name: Deploy AppRole credentials to host
|
- name: Deploy AppRole credentials to host
|
||||||
hosts: vault_target
|
hosts: all
|
||||||
gather_facts: false
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
vault_role_id: "{{ hostvars['localhost']['vault_role_id'] }}"
|
||||||
|
vault_secret_id: "{{ hostvars['localhost']['vault_secret_id'] }}"
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Create AppRole directory
|
- name: Create AppRole directory
|
||||||
ansible.builtin.file:
|
ansible.builtin.file:
|
||||||
48
ansible/playbooks/reboot.yml
Normal file
48
ansible/playbooks/reboot.yml
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
---
|
||||||
|
# Reboot hosts with rolling strategy to avoid taking down redundant services
|
||||||
|
#
|
||||||
|
# Usage examples:
|
||||||
|
# # Reboot a single host
|
||||||
|
# ansible-playbook reboot.yml -l testvm01
|
||||||
|
#
|
||||||
|
# # Reboot all test hosts (one at a time)
|
||||||
|
# ansible-playbook reboot.yml -l tier_test
|
||||||
|
#
|
||||||
|
# # Reboot all DNS servers safely (one at a time)
|
||||||
|
# ansible-playbook reboot.yml -l role_dns
|
||||||
|
#
|
||||||
|
# Safety features:
|
||||||
|
# - serial: 1 ensures only one host reboots at a time
|
||||||
|
# - Waits for host to come back online before proceeding
|
||||||
|
# - Groups hosts by role to avoid rebooting same-role hosts consecutively
|
||||||
|
|
||||||
|
- name: Reboot hosts (rolling)
|
||||||
|
hosts: all
|
||||||
|
serial: 1
|
||||||
|
order: shuffle # Randomize to spread out same-role hosts
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
reboot_timeout: 300 # 5 minutes to wait for host to come back
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Display reboot target
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Rebooting {{ inventory_hostname }} (role: {{ role | default('none') }})"
|
||||||
|
|
||||||
|
- name: Reboot the host
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: reboot.target
|
||||||
|
state: started
|
||||||
|
async: 1
|
||||||
|
poll: 0
|
||||||
|
ignore_errors: true
|
||||||
|
|
||||||
|
- name: Wait for host to come back online
|
||||||
|
ansible.builtin.wait_for_connection:
|
||||||
|
delay: 5
|
||||||
|
timeout: "{{ reboot_timeout }}"
|
||||||
|
|
||||||
|
- name: Display reboot result
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "{{ inventory_hostname }} rebooted successfully"
|
||||||
40
ansible/playbooks/restart-service.yml
Normal file
40
ansible/playbooks/restart-service.yml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
---
|
||||||
|
# Restart a systemd service on target hosts
|
||||||
|
#
|
||||||
|
# Usage examples:
|
||||||
|
# # Restart unbound on all DNS servers
|
||||||
|
# ansible-playbook restart-service.yml -l role_dns -e service=unbound
|
||||||
|
#
|
||||||
|
# # Restart nginx on a specific host
|
||||||
|
# ansible-playbook restart-service.yml -l http-proxy.home.2rjus.net -e service=nginx
|
||||||
|
#
|
||||||
|
# # Restart promtail on all prod hosts
|
||||||
|
# ansible-playbook restart-service.yml -l tier_prod -e service=promtail
|
||||||
|
|
||||||
|
- name: Restart systemd service
|
||||||
|
hosts: all
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Validate service name provided
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: |
|
||||||
|
The 'service' variable is required.
|
||||||
|
Usage: ansible-playbook restart-service.yml -l <target> -e service=<name>
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
-e service=nginx
|
||||||
|
-e service=unbound
|
||||||
|
-e service=promtail
|
||||||
|
when: service is not defined
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
- name: Restart {{ service }}
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: "{{ service }}"
|
||||||
|
state: restarted
|
||||||
|
register: restart_result
|
||||||
|
|
||||||
|
- name: Display result
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Service {{ service }} restarted on {{ inventory_hostname }}"
|
||||||
@@ -50,7 +50,7 @@ homelab.host.tier = "test"; # or "prod"
|
|||||||
During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with:
|
During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with:
|
||||||
|
|
||||||
```
|
```
|
||||||
{job="bootstrap", host="<hostname>"}
|
{job="bootstrap", hostname="<hostname>"}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Bootstrap Stages
|
### Bootstrap Stages
|
||||||
@@ -72,7 +72,7 @@ The bootstrap process reports these stages via the `stage` label:
|
|||||||
|
|
||||||
```
|
```
|
||||||
# All bootstrap activity for a host
|
# All bootstrap activity for a host
|
||||||
{job="bootstrap", host="myhost"}
|
{job="bootstrap", hostname="myhost"}
|
||||||
|
|
||||||
# Track all failures
|
# Track all failures
|
||||||
{job="bootstrap", stage="failed"}
|
{job="bootstrap", stage="failed"}
|
||||||
@@ -87,7 +87,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
|
|||||||
|
|
||||||
1. Check bootstrap completed successfully:
|
1. Check bootstrap completed successfully:
|
||||||
```
|
```
|
||||||
{job="bootstrap", host="<hostname>", stage="success"}
|
{job="bootstrap", hostname="<hostname>", stage="success"}
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Verify the host is up and reporting metrics:
|
2. Verify the host is up and reporting metrics:
|
||||||
@@ -102,7 +102,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
|
|||||||
|
|
||||||
4. Check logs are flowing:
|
4. Check logs are flowing:
|
||||||
```
|
```
|
||||||
{host="<hostname>"}
|
{hostname="<hostname>"}
|
||||||
```
|
```
|
||||||
|
|
||||||
5. Confirm expected services are running and producing logs
|
5. Confirm expected services are running and producing logs
|
||||||
@@ -119,7 +119,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
|
|||||||
|
|
||||||
1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources:
|
1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources:
|
||||||
```
|
```
|
||||||
{job="bootstrap", host="<hostname>"}
|
{job="bootstrap", hostname="<hostname>"}
|
||||||
```
|
```
|
||||||
|
|
||||||
2. **USER**: SSH into the host and check the bootstrap service:
|
2. **USER**: SSH into the host and check the bootstrap service:
|
||||||
@@ -149,7 +149,7 @@ Usually caused by running the `create-host` script without proper credentials, o
|
|||||||
|
|
||||||
2. Check bootstrap logs for vault-related stages:
|
2. Check bootstrap logs for vault-related stages:
|
||||||
```
|
```
|
||||||
{job="bootstrap", host="<hostname>", stage=~"vault.*"}
|
{job="bootstrap", hostname="<hostname>", stage=~"vault.*"}
|
||||||
```
|
```
|
||||||
|
|
||||||
3. **USER**: Regenerate and provision credentials manually:
|
3. **USER**: Regenerate and provision credentials manually:
|
||||||
|
|||||||
@@ -66,9 +66,9 @@ This future migration path is a strong argument for Kanidm over LDAP-only soluti
|
|||||||
- Vault integration for idm_admin password
|
- Vault integration for idm_admin password
|
||||||
- LDAPS on port 636
|
- LDAPS on port 636
|
||||||
|
|
||||||
2. **Configure declarative provisioning** ✅
|
2. **Configure provisioning** ✅
|
||||||
- Groups: `admins`, `users`, `ssh-users`
|
- Groups provisioned declaratively: `admins`, `users`, `ssh-users`
|
||||||
- User: `torjus` (member of all groups)
|
- Users managed imperatively via CLI (allows setting POSIX passwords in one step)
|
||||||
- POSIX attributes enabled (UID/GID range 65,536-69,999)
|
- POSIX attributes enabled (UID/GID range 65,536-69,999)
|
||||||
|
|
||||||
3. **Test NAS integration** (in progress)
|
3. **Test NAS integration** (in progress)
|
||||||
@@ -80,14 +80,16 @@ This future migration path is a strong argument for Kanidm over LDAP-only soluti
|
|||||||
- Grafana
|
- Grafana
|
||||||
- Other services as needed
|
- Other services as needed
|
||||||
|
|
||||||
5. **Create client module** in `system/` for PAM/NSS
|
5. **Create client module** in `system/` for PAM/NSS ✅
|
||||||
- Enable on all hosts that need central auth
|
- Module: `system/kanidm-client.nix`
|
||||||
- Configure trusted CA
|
- `homelab.kanidm.enable = true` enables PAM/NSS
|
||||||
|
- Short usernames (not SPN format)
|
||||||
|
- Home directory symlinks via `home_alias`
|
||||||
|
- Enabled on test tier: testvm01, testvm02, testvm03
|
||||||
|
|
||||||
6. **Documentation**
|
6. **Documentation** ✅
|
||||||
- User management procedures
|
- `docs/user-management.md` - CLI workflows, troubleshooting
|
||||||
- Adding new OAuth2 clients
|
- User/group creation procedures verified working
|
||||||
- Troubleshooting PAM/NSS issues
|
|
||||||
|
|
||||||
## Progress
|
## Progress
|
||||||
|
|
||||||
@@ -106,14 +108,37 @@ This future migration path is a strong argument for Kanidm over LDAP-only soluti
|
|||||||
- Prometheus monitoring scrape target configured
|
- Prometheus monitoring scrape target configured
|
||||||
|
|
||||||
**Provisioned entities:**
|
**Provisioned entities:**
|
||||||
- Groups: `admins`, `users`, `ssh-users`
|
- Groups: `admins`, `users`, `ssh-users` (declarative)
|
||||||
- User: `torjus` (member of all groups, POSIX enabled with GID 65536)
|
- Users managed via CLI (imperative)
|
||||||
|
|
||||||
**Verified working:**
|
**Verified working:**
|
||||||
- WebUI login with idm_admin
|
- WebUI login with idm_admin
|
||||||
- LDAP bind and search with POSIX-enabled user
|
- LDAP bind and search with POSIX-enabled user
|
||||||
- LDAPS with valid internal CA certificate
|
- LDAPS with valid internal CA certificate
|
||||||
|
|
||||||
|
### Completed (2026-02-08) - PAM/NSS Client
|
||||||
|
|
||||||
|
**Client module deployed (`system/kanidm-client.nix`):**
|
||||||
|
- `homelab.kanidm.enable = true` enables PAM/NSS integration
|
||||||
|
- Connects to auth.home.2rjus.net
|
||||||
|
- Short usernames (`torjus` instead of `torjus@home.2rjus.net`)
|
||||||
|
- Home directory symlinks (`/home/torjus` → UUID-based dir)
|
||||||
|
- Login restricted to `ssh-users` group
|
||||||
|
|
||||||
|
**Enabled on test tier:**
|
||||||
|
- testvm01, testvm02, testvm03
|
||||||
|
|
||||||
|
**Verified working:**
|
||||||
|
- User/group resolution via `getent`
|
||||||
|
- SSH login with Kanidm unix passwords
|
||||||
|
- Home directory creation with symlinks
|
||||||
|
- Imperative user/group creation via CLI
|
||||||
|
|
||||||
|
**Documentation:**
|
||||||
|
- `docs/user-management.md` with full CLI workflows
|
||||||
|
- Password requirements (min 10 chars)
|
||||||
|
- Troubleshooting guide (nscd, cache invalidation)
|
||||||
|
|
||||||
### UID/GID Range (Resolved)
|
### UID/GID Range (Resolved)
|
||||||
|
|
||||||
**Range: 65,536 - 69,999** (manually allocated)
|
**Range: 65,536 - 69,999** (manually allocated)
|
||||||
@@ -126,12 +151,30 @@ Rationale:
|
|||||||
- Well above NixOS system users (typically <1000)
|
- Well above NixOS system users (typically <1000)
|
||||||
- Avoids Podman/container issues with very high GIDs
|
- Avoids Podman/container issues with very high GIDs
|
||||||
|
|
||||||
|
### Completed (2026-02-08) - OAuth2/OIDC for Grafana
|
||||||
|
|
||||||
|
**OAuth2 client deployed for Grafana on monitoring02:**
|
||||||
|
- Client ID: `grafana`
|
||||||
|
- Redirect URL: `https://grafana-test.home.2rjus.net/login/generic_oauth`
|
||||||
|
- Scope maps: `openid`, `profile`, `email`, `groups` for `users` group
|
||||||
|
- Role mapping: `admins` group → Grafana Admin, others → Viewer
|
||||||
|
|
||||||
|
**Configuration locations:**
|
||||||
|
- Kanidm OAuth2 client: `services/kanidm/default.nix`
|
||||||
|
- Grafana OIDC config: `services/grafana/default.nix`
|
||||||
|
- Vault secret: `services/grafana/oauth2-client-secret`
|
||||||
|
|
||||||
|
**Key findings:**
|
||||||
|
- PKCE is required by Kanidm - enable `use_pkce = true` in Grafana
|
||||||
|
- Must set `email_attribute_path`, `login_attribute_path`, `name_attribute_path` to extract from userinfo
|
||||||
|
- Users need: primary credential (password + TOTP for MFA), membership in `users` group, email address set
|
||||||
|
- Unix password is separate from primary credential (web login requires primary credential)
|
||||||
|
|
||||||
### Next Steps
|
### Next Steps
|
||||||
|
|
||||||
1. Deploy to monitoring01 to enable Prometheus scraping
|
1. Enable PAM/NSS on production hosts (after test tier validation)
|
||||||
2. Configure TrueNAS LDAP client for NAS integration testing
|
2. Configure TrueNAS LDAP client for NAS integration testing
|
||||||
3. Add OAuth2 clients (Grafana first)
|
3. Add OAuth2 clients for other services as needed
|
||||||
4. Create PAM/NSS client module for other hosts
|
|
||||||
|
|
||||||
## References
|
## References
|
||||||
|
|
||||||
46
docs/plans/completed/garage-s3-storage.md
Normal file
46
docs/plans/completed/garage-s3-storage.md
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# Garage S3 Storage Server
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Deploy a Garage instance for self-hosted S3-compatible object storage.
|
||||||
|
|
||||||
|
## Garage Basics
|
||||||
|
|
||||||
|
- S3-compatible distributed object storage designed for self-hosting
|
||||||
|
- Supports per-key, per-bucket permissions (read/write/owner)
|
||||||
|
- Keys without explicit grants have no access
|
||||||
|
|
||||||
|
## NixOS Module
|
||||||
|
|
||||||
|
Available as `services.garage` with these key options:
|
||||||
|
|
||||||
|
- `services.garage.enable` - Enable the service
|
||||||
|
- `services.garage.package` - Must be set explicitly
|
||||||
|
- `services.garage.settings` - Freeform TOML config (replication mode, ports, RPC, etc.)
|
||||||
|
- `services.garage.settings.metadata_dir` - Metadata storage (SSD recommended)
|
||||||
|
- `services.garage.settings.data_dir` - Data block storage (supports multiple dirs since v0.9)
|
||||||
|
- `services.garage.environmentFile` - For secrets like `GARAGE_RPC_SECRET`
|
||||||
|
- `services.garage.logLevel` - error/warn/info/debug/trace
|
||||||
|
|
||||||
|
The NixOS module only manages the server daemon. Buckets and keys are managed externally.
|
||||||
|
|
||||||
|
## Bucket/Key Management
|
||||||
|
|
||||||
|
No declarative NixOS options for buckets or keys. Two options:
|
||||||
|
|
||||||
|
1. **Terraform provider** - `jkossis/terraform-provider-garage` manages buckets, keys, and permissions via the Garage Admin API v2. Could live in `terraform/garage/` similar to `terraform/vault/`.
|
||||||
|
2. **CLI** - `garage key create`, `garage bucket create`, `garage bucket allow`
|
||||||
|
|
||||||
|
## Integration Ideas
|
||||||
|
|
||||||
|
- Store Garage API keys in Vault, fetch via `vault.secrets` on consuming hosts
|
||||||
|
- Terraform manages both Vault secrets and Garage buckets/keys
|
||||||
|
- Enable admin API with token for Terraform provider access
|
||||||
|
- Add Prometheus metrics scraping (Garage exposes metrics endpoint)
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- Single-node or multi-node replication?
|
||||||
|
- Which host to deploy on?
|
||||||
|
- What to store? (backups, media, app data)
|
||||||
|
- Expose via HTTP proxy or direct S3 API only?
|
||||||
156
docs/plans/completed/monitoring-migration-victoriametrics.md
Normal file
156
docs/plans/completed/monitoring-migration-victoriametrics.md
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
# Monitoring Stack Migration to VictoriaMetrics
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Migrate from Prometheus to VictoriaMetrics on a new host (monitoring02) to gain better compression
|
||||||
|
and longer retention. Run in parallel with monitoring01 until validated, then switch over using
|
||||||
|
a `monitoring` CNAME for seamless transition.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
**monitoring02** (10.69.13.24) - **PRIMARY**:
|
||||||
|
- 4 CPU cores, 8GB RAM, 60GB disk
|
||||||
|
- VictoriaMetrics with 3-month retention
|
||||||
|
- vmalert with alerting enabled (routes to local Alertmanager)
|
||||||
|
- Alertmanager -> alerttonotify -> NATS notification pipeline
|
||||||
|
- Grafana with Kanidm OIDC (`grafana.home.2rjus.net`)
|
||||||
|
- Loki (log aggregation)
|
||||||
|
- CNAMEs: monitoring, alertmanager, grafana, grafana-test, metrics, vmalert, loki
|
||||||
|
|
||||||
|
**monitoring01** (10.69.13.13) - **SHUT DOWN**:
|
||||||
|
- No longer running, pending decommission
|
||||||
|
|
||||||
|
## Decision: VictoriaMetrics
|
||||||
|
|
||||||
|
Per `docs/plans/long-term-metrics-storage.md`, VictoriaMetrics is the recommended starting point:
|
||||||
|
- Single binary replacement for Prometheus
|
||||||
|
- 5-10x better compression (30 days could become 180+ days in same space)
|
||||||
|
- Same PromQL query language (Grafana dashboards work unchanged)
|
||||||
|
- Same scrape config format (existing auto-generated configs work)
|
||||||
|
|
||||||
|
If multi-year retention with downsampling becomes necessary later, Thanos can be evaluated.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐
|
||||||
|
│ monitoring02 │
|
||||||
|
│ VictoriaMetrics│
|
||||||
|
│ + Grafana │
|
||||||
|
monitoring │ + Loki │
|
||||||
|
CNAME ──────────│ + Alertmanager │
|
||||||
|
│ (vmalert) │
|
||||||
|
└─────────────────┘
|
||||||
|
▲
|
||||||
|
│ scrapes
|
||||||
|
┌───────────────┼───────────────┐
|
||||||
|
│ │ │
|
||||||
|
┌────┴────┐ ┌─────┴────┐ ┌─────┴────┐
|
||||||
|
│ ns1 │ │ ha1 │ │ ... │
|
||||||
|
│ :9100 │ │ :9100 │ │ :9100 │
|
||||||
|
└─────────┘ └──────────┘ └──────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
### Phase 1: Create monitoring02 Host [COMPLETE]
|
||||||
|
|
||||||
|
Host created and deployed at 10.69.13.24 (prod tier) with:
|
||||||
|
- 4 CPU cores, 8GB RAM, 60GB disk
|
||||||
|
- Vault integration enabled
|
||||||
|
- NATS-based remote deployment enabled
|
||||||
|
- Grafana with Kanidm OIDC deployed as test instance (`grafana-test.home.2rjus.net`)
|
||||||
|
|
||||||
|
### Phase 2: Set Up VictoriaMetrics Stack [COMPLETE]
|
||||||
|
|
||||||
|
New service module at `services/victoriametrics/` for VictoriaMetrics + vmalert + Alertmanager.
|
||||||
|
Imported by monitoring02 alongside the existing Grafana service.
|
||||||
|
|
||||||
|
1. **VictoriaMetrics** (port 8428):
|
||||||
|
- `services.victoriametrics.enable = true`
|
||||||
|
- `retentionPeriod = "3"` (3 months)
|
||||||
|
- All scrape configs migrated from Prometheus (22 jobs including auto-generated)
|
||||||
|
- Static user override (DynamicUser disabled) for credential file access
|
||||||
|
- OpenBao token fetch service + 30min refresh timer
|
||||||
|
- Apiary bearer token via vault.secrets
|
||||||
|
|
||||||
|
2. **vmalert** for alerting rules:
|
||||||
|
- Points to VictoriaMetrics datasource at localhost:8428
|
||||||
|
- Reuses existing `services/monitoring/rules.yml` directly via `settings.rule`
|
||||||
|
- Notifier sends to local Alertmanager at localhost:9093
|
||||||
|
|
||||||
|
3. **Alertmanager** (port 9093):
|
||||||
|
- Same configuration as monitoring01 (alerttonotify webhook routing)
|
||||||
|
- alerttonotify imported on monitoring02, routes alerts via NATS
|
||||||
|
|
||||||
|
4. **Grafana** (port 3000):
|
||||||
|
- VictoriaMetrics datasource (localhost:8428) as default
|
||||||
|
- Loki datasource pointing to localhost:3100
|
||||||
|
|
||||||
|
5. **Loki** (port 3100):
|
||||||
|
- Same configuration as monitoring01 in standalone `services/loki/` module
|
||||||
|
- Grafana datasource updated to localhost:3100
|
||||||
|
|
||||||
|
**Note:** pve-exporter and pushgateway scrape targets are not included on monitoring02.
|
||||||
|
pve-exporter requires a local exporter instance; pushgateway is replaced by VictoriaMetrics
|
||||||
|
native push support.
|
||||||
|
|
||||||
|
### Phase 3: Parallel Operation [COMPLETE]
|
||||||
|
|
||||||
|
Ran both monitoring01 and monitoring02 simultaneously to validate data collection and dashboards.
|
||||||
|
|
||||||
|
### Phase 4: Add monitoring CNAME [COMPLETE]
|
||||||
|
|
||||||
|
Added CNAMEs to monitoring02: monitoring, alertmanager, grafana, metrics, vmalert, loki.
|
||||||
|
|
||||||
|
### Phase 5: Update References [COMPLETE]
|
||||||
|
|
||||||
|
- Moved alertmanager, grafana, prometheus CNAMEs from http-proxy to monitoring02
|
||||||
|
- Removed corresponding Caddy reverse proxy entries from http-proxy
|
||||||
|
- monitoring02 Caddy serves alertmanager, grafana, metrics, vmalert directly
|
||||||
|
|
||||||
|
### Phase 6: Enable Alerting [COMPLETE]
|
||||||
|
|
||||||
|
- Switched vmalert from blackhole mode to local Alertmanager
|
||||||
|
- alerttonotify service running on monitoring02 (NATS nkey from Vault)
|
||||||
|
- prometheus-metrics Vault policy added for OpenBao scraping
|
||||||
|
- Full alerting pipeline verified: vmalert -> Alertmanager -> alerttonotify -> NATS
|
||||||
|
|
||||||
|
### Phase 7: Cutover and Decommission [IN PROGRESS]
|
||||||
|
|
||||||
|
- monitoring01 shut down (2026-02-17)
|
||||||
|
- Vault AppRole moved from approle.tf to hosts-generated.tf with extra_policies support
|
||||||
|
|
||||||
|
**Remaining cleanup (separate branch):**
|
||||||
|
- [ ] Update `system/monitoring/logs.nix` - Promtail still points to monitoring01
|
||||||
|
- [ ] Update `hosts/template2/bootstrap.nix` - Bootstrap Loki URL still points to monitoring01
|
||||||
|
- [ ] Remove monitoring01 from flake.nix and host configuration
|
||||||
|
- [ ] Destroy monitoring01 VM in Proxmox
|
||||||
|
- [ ] Remove monitoring01 from terraform state
|
||||||
|
- [ ] Remove or archive `services/monitoring/` (Prometheus config)
|
||||||
|
|
||||||
|
## Completed
|
||||||
|
|
||||||
|
- 2026-02-08: Phase 1 - monitoring02 host created
|
||||||
|
- 2026-02-17: Phase 2 - VictoriaMetrics, vmalert, Alertmanager, Loki, Grafana configured
|
||||||
|
- 2026-02-17: Phase 6 - Alerting enabled, CNAMEs migrated, monitoring01 shut down
|
||||||
|
|
||||||
|
## VictoriaMetrics Service Configuration
|
||||||
|
|
||||||
|
Implemented in `services/victoriametrics/default.nix`. Key design decisions:
|
||||||
|
|
||||||
|
- **Static user**: VictoriaMetrics NixOS module uses `DynamicUser`, overridden with a static
|
||||||
|
`victoriametrics` user so vault.secrets and credential files work correctly
|
||||||
|
- **Shared rules**: vmalert reuses `services/monitoring/rules.yml` via `settings.rule` path
|
||||||
|
reference (no YAML-to-Nix conversion needed)
|
||||||
|
- **Scrape config reuse**: Uses the same `lib/monitoring.nix` functions and
|
||||||
|
`services/monitoring/external-targets.nix` as Prometheus for auto-generated targets
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- VictoriaMetrics uses port 8428 vs Prometheus 9090
|
||||||
|
- PromQL compatibility is excellent
|
||||||
|
- VictoriaMetrics native push replaces Pushgateway (remove from http-proxy if not needed)
|
||||||
|
- monitoring02 deployed via OpenTofu using `create-host` script
|
||||||
|
- Grafana dashboards defined declaratively via NixOS, not imported from monitoring01 state
|
||||||
|
- Tempo and Pyroscope deferred (not actively used; can be added later if needed)
|
||||||
135
docs/plans/completed/monitoring02-reboot-alert-investigation.md
Normal file
135
docs/plans/completed/monitoring02-reboot-alert-investigation.md
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
# monitoring02 Reboot Alert Investigation
|
||||||
|
|
||||||
|
**Date:** 2026-02-10
|
||||||
|
**Status:** Completed - False positive identified
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
A `host_reboot` alert fired for monitoring02 at 16:27:36 UTC. Investigation determined this was a **false positive** caused by NTP clock adjustments, not an actual reboot.
|
||||||
|
|
||||||
|
## Alert Details
|
||||||
|
|
||||||
|
- **Alert:** `host_reboot`
|
||||||
|
- **Rule:** `changes(node_boot_time_seconds[10m]) > 0`
|
||||||
|
- **Host:** monitoring02
|
||||||
|
- **Time:** 2026-02-10T16:27:36Z
|
||||||
|
|
||||||
|
## Investigation Findings
|
||||||
|
|
||||||
|
### Evidence Against Actual Reboot
|
||||||
|
|
||||||
|
1. **Uptime:** System had been up for ~40 hours (143,751 seconds) at time of alert
|
||||||
|
2. **Consistent BOOT_ID:** All logs showed the same systemd BOOT_ID (`fd26e7f3d86f4cd688d1b1d7af62f2ad`) from Feb 9 through the alert time
|
||||||
|
3. **No log gaps:** Logs were continuous - no shutdown/restart cycle visible
|
||||||
|
4. **Prometheus metrics:** `node_boot_time_seconds` showed a 1-second fluctuation, then returned to normal
|
||||||
|
|
||||||
|
### Root Cause: NTP Clock Adjustment
|
||||||
|
|
||||||
|
The `node_boot_time_seconds` metric fluctuated by 1 second due to how Linux calculates boot time:
|
||||||
|
|
||||||
|
```
|
||||||
|
btime = current_wall_clock_time - monotonic_uptime
|
||||||
|
```
|
||||||
|
|
||||||
|
When NTP adjusts the wall clock, `btime` shifts by the same amount. The `node_timex_*` metrics confirmed this:
|
||||||
|
|
||||||
|
| Metric | Value |
|
||||||
|
|--------|-------|
|
||||||
|
| `node_timex_maxerror_seconds` (max in 3h) | 1.02 seconds |
|
||||||
|
| `node_timex_maxerror_seconds` (max in 24h) | 2.05 seconds |
|
||||||
|
| `node_timex_sync_status` | 1 (synced) |
|
||||||
|
| Current `node_timex_offset_seconds` | ~9ms (normal) |
|
||||||
|
|
||||||
|
The kernel's estimated maximum clock error spiked to over 1 second, causing the boot time calculation to drift momentarily.
|
||||||
|
|
||||||
|
Additionally, `systemd-resolved` logged "Clock change detected. Flushing caches." at 16:26:53Z, corroborating the NTP adjustment.
|
||||||
|
|
||||||
|
## Current Time Sync Configuration
|
||||||
|
|
||||||
|
### NixOS Guests
|
||||||
|
- **NTP client:** systemd-timesyncd (NixOS default)
|
||||||
|
- **No explicit configuration** in the codebase
|
||||||
|
- Uses default NixOS NTP server pool
|
||||||
|
|
||||||
|
### Proxmox VMs
|
||||||
|
- **Clocksource:** `kvm-clock` (optimal for KVM VMs)
|
||||||
|
- **QEMU guest agent:** Enabled
|
||||||
|
- **No additional QEMU timing args** configured
|
||||||
|
|
||||||
|
## Potential Improvements
|
||||||
|
|
||||||
|
### 1. Improve Alert Rule (Recommended)
|
||||||
|
|
||||||
|
Add tolerance to filter out small NTP adjustments:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Current rule (triggers on any change)
|
||||||
|
expr: changes(node_boot_time_seconds[10m]) > 0
|
||||||
|
|
||||||
|
# Improved rule (requires >60 second shift)
|
||||||
|
expr: changes(node_boot_time_seconds[10m]) > 0 and abs(delta(node_boot_time_seconds[10m])) > 60
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Switch to Chrony (Optional)
|
||||||
|
|
||||||
|
Chrony handles time adjustments more gracefully than systemd-timesyncd:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# In common/vm/qemu-guest.nix
|
||||||
|
{
|
||||||
|
services.qemuGuest.enable = true;
|
||||||
|
|
||||||
|
services.timesyncd.enable = false;
|
||||||
|
services.chrony = {
|
||||||
|
enable = true;
|
||||||
|
extraConfig = ''
|
||||||
|
makestep 1 3
|
||||||
|
rtcsync
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Add QEMU Timing Args (Optional)
|
||||||
|
|
||||||
|
In `terraform/vms.tf`:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
args = "-global kvm-pit.lost_tick_policy=delay -rtc driftfix=slew"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Local NTP Server (Optional)
|
||||||
|
|
||||||
|
Running a local NTP server (e.g., on ns1/ns2) would reduce latency and improve sync stability across all hosts.
|
||||||
|
|
||||||
|
## Monitoring NTP Health
|
||||||
|
|
||||||
|
The `node_timex_*` metrics from node_exporter provide visibility into NTP health:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Clock offset from reference
|
||||||
|
node_timex_offset_seconds
|
||||||
|
|
||||||
|
# Sync status (1 = synced)
|
||||||
|
node_timex_sync_status
|
||||||
|
|
||||||
|
# Maximum estimated error - useful for alerting
|
||||||
|
node_timex_maxerror_seconds
|
||||||
|
```
|
||||||
|
|
||||||
|
A potential alert for NTP issues:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- alert: ntp_clock_drift
|
||||||
|
expr: node_timex_maxerror_seconds > 1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "High clock drift on {{ $labels.hostname }}"
|
||||||
|
description: "NTP max error is {{ $value }}s on {{ $labels.hostname }}"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
No action required for the alert itself - the system was healthy. Consider implementing the improved alert rule to prevent future false positives from NTP adjustments.
|
||||||
156
docs/plans/completed/nix-cache-reprovision.md
Normal file
156
docs/plans/completed/nix-cache-reprovision.md
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
# Nix Cache Host Reprovision
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Reprovision `nix-cache01` using the OpenTofu workflow, and improve the build/cache system with:
|
||||||
|
1. NATS-based remote build triggering (replacing the current bash script)
|
||||||
|
2. Safer flake update workflow that validates builds before pushing to master
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
**Phase 1: New Build Host** - COMPLETE
|
||||||
|
**Phase 2: NATS Build Triggering** - COMPLETE
|
||||||
|
**Phase 3: Safe Flake Update Workflow** - NOT STARTED
|
||||||
|
**Phase 4: Complete Migration** - COMPLETE
|
||||||
|
**Phase 5: Scheduled Builds** - COMPLETE
|
||||||
|
|
||||||
|
## Completed Work
|
||||||
|
|
||||||
|
### New Build Host (nix-cache02)
|
||||||
|
|
||||||
|
Instead of reprovisioning nix-cache01 in-place, we created a new host `nix-cache02` at 10.69.13.25:
|
||||||
|
|
||||||
|
- **Specs**: 8 CPU cores, 16GB RAM (temporarily, will increase to 24GB after nix-cache01 decommissioned), 200GB disk
|
||||||
|
- **Provisioned via OpenTofu** with automatic Vault credential bootstrapping
|
||||||
|
- **Builder service** configured with two repos:
|
||||||
|
- `nixos-servers` → `git+https://git.t-juice.club/torjus/nixos-servers.git`
|
||||||
|
- `nixos` (gunter) → `git+https://git.t-juice.club/torjus/nixos.git`
|
||||||
|
|
||||||
|
### NATS-Based Build Triggering
|
||||||
|
|
||||||
|
The `homelab-deploy` tool was extended with a builder mode:
|
||||||
|
|
||||||
|
**NATS Subjects:**
|
||||||
|
- `build.<repo>.<target>` - e.g., `build.nixos-servers.all` or `build.nixos-servers.ns1`
|
||||||
|
|
||||||
|
**NATS Permissions (in DEPLOY account):**
|
||||||
|
| User | Publish | Subscribe |
|
||||||
|
|------|---------|-----------|
|
||||||
|
| Builder | `build.responses.>` | `build.>` |
|
||||||
|
| Test deployer | `deploy.test.>`, `deploy.discover`, `build.>` | `deploy.responses.>`, `deploy.discover`, `build.responses.>` |
|
||||||
|
| Admin deployer | `deploy.>`, `build.>` | `deploy.>`, `build.responses.>` |
|
||||||
|
|
||||||
|
**Vault Secrets:**
|
||||||
|
- `shared/homelab-deploy/builder-nkey` - NKey seed for builder authentication
|
||||||
|
|
||||||
|
**NixOS Configuration:**
|
||||||
|
- `hosts/nix-cache02/builder.nix` - Builder service configuration
|
||||||
|
- `services/nats/default.nix` - Updated with builder NATS user
|
||||||
|
|
||||||
|
**MCP Integration:**
|
||||||
|
- `.mcp.json` updated with `--enable-builds` flag
|
||||||
|
- Build tool available via MCP for Claude Code
|
||||||
|
|
||||||
|
**Tested:**
|
||||||
|
- Single host build: `build nixos-servers testvm01` (~30s)
|
||||||
|
- All hosts build: `build nixos-servers all` (16 hosts in ~226s)
|
||||||
|
|
||||||
|
### Harmonia Binary Cache
|
||||||
|
|
||||||
|
- Parameterized `services/nix-cache/harmonia.nix` to use hostname-based Vault paths
|
||||||
|
- Parameterized `services/nix-cache/proxy.nix` for hostname-based domain
|
||||||
|
- New signing key: `nix-cache02.home.2rjus.net-1`
|
||||||
|
- Vault secret: `hosts/nix-cache02/cache-secret`
|
||||||
|
- Removed unused Gitea Actions runner from nix-cache01
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### nix-cache02 (Active)
|
||||||
|
- Running at 10.69.13.25
|
||||||
|
- Serving `https://nix-cache.home.2rjus.net` (canonical URL)
|
||||||
|
- Builder service active, responding to NATS build requests
|
||||||
|
- Metrics exposed on port 9973 (`homelab-deploy-builder` job)
|
||||||
|
- Harmonia binary cache server running
|
||||||
|
- Signing key: `nix-cache02.home.2rjus.net-1`
|
||||||
|
- Prod tier with `build-host` role
|
||||||
|
|
||||||
|
### nix-cache01 (Decommissioned)
|
||||||
|
- VM deleted from Proxmox
|
||||||
|
- Host configuration removed from repo
|
||||||
|
- Vault AppRole and secrets removed
|
||||||
|
- Old signing key removed from trusted-public-keys
|
||||||
|
|
||||||
|
## Remaining Work
|
||||||
|
|
||||||
|
### Phase 3: Safe Flake Update Workflow
|
||||||
|
|
||||||
|
1. Create `.github/workflows/flake-update-safe.yaml`
|
||||||
|
2. Disable or remove old `flake-update.yaml`
|
||||||
|
3. Test manually with `workflow_dispatch`
|
||||||
|
4. Monitor first automated run
|
||||||
|
|
||||||
|
### Phase 4: Complete Migration ✅
|
||||||
|
|
||||||
|
1. ~~**Add Harmonia to nix-cache02**~~ ✅ Done - new signing key, parameterized service
|
||||||
|
2. ~~**Add trusted public key to all hosts**~~ ✅ Done - `system/nix.nix` updated
|
||||||
|
3. ~~**Test cache from other hosts**~~ ✅ Done - verified from testvm01
|
||||||
|
4. ~~**Update proxy and DNS**~~ ✅ Done - `nix-cache.home.2rjus.net` CNAME now points to nix-cache02
|
||||||
|
5. ~~**Deploy to all hosts**~~ ✅ Done - all hosts have new trusted key
|
||||||
|
6. ~~**Decommission nix-cache01**~~ ✅ Done - 2026-02-10:
|
||||||
|
- Removed `hosts/nix-cache01/` directory
|
||||||
|
- Removed `services/nix-cache/build-flakes.{nix,sh}`
|
||||||
|
- Removed Vault AppRole and secrets
|
||||||
|
- Removed old signing key from `system/nix.nix`
|
||||||
|
- Removed from `flake.nix`
|
||||||
|
- Deleted VM from Proxmox
|
||||||
|
|
||||||
|
### Phase 5: Scheduled Builds ✅
|
||||||
|
|
||||||
|
Implemented a systemd timer on nix-cache02 that triggers builds every 2 hours:
|
||||||
|
|
||||||
|
- **Timer**: `scheduled-build.timer` runs every 2 hours with 5m random jitter
|
||||||
|
- **Service**: `scheduled-build.service` calls `homelab-deploy build` for both repos
|
||||||
|
- **Authentication**: Dedicated scheduler NKey stored in Vault
|
||||||
|
- **NATS user**: Added to DEPLOY account with publish `build.>` and subscribe `build.responses.>`
|
||||||
|
|
||||||
|
Files:
|
||||||
|
- `hosts/nix-cache02/scheduler.nix` - Timer and service configuration
|
||||||
|
- `services/nats/default.nix` - Scheduler NATS user
|
||||||
|
- `terraform/vault/secrets.tf` - Scheduler NKey secret
|
||||||
|
- `terraform/vault/variables.tf` - Variable for scheduler NKey
|
||||||
|
|
||||||
|
## Resolved Questions
|
||||||
|
|
||||||
|
- **Parallel vs sequential builds?** Sequential - hosts share packages, subsequent builds are fast after first
|
||||||
|
- **What about gunter?** Configured as `nixos` repo in builder settings
|
||||||
|
- **Disk size?** 200GB for new host
|
||||||
|
- **Build host specs?** 8 cores, 16-24GB RAM matches current nix-cache01
|
||||||
|
|
||||||
|
### Phase 6: Observability
|
||||||
|
|
||||||
|
1. **Alerting rules** for build failures:
|
||||||
|
```promql
|
||||||
|
# Alert if any build fails
|
||||||
|
increase(homelab_deploy_build_host_total{status="failure"}[1h]) > 0
|
||||||
|
|
||||||
|
# Alert if no successful builds in 24h (scheduled builds stopped)
|
||||||
|
time() - homelab_deploy_build_last_success_timestamp > 86400
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Grafana dashboard** for build metrics:
|
||||||
|
- Build success/failure rate over time
|
||||||
|
- Average build duration per host (histogram)
|
||||||
|
- Build frequency (builds per hour/day)
|
||||||
|
- Last successful build timestamp per repo
|
||||||
|
|
||||||
|
Available metrics:
|
||||||
|
- `homelab_deploy_builds_total{repo, status}` - total builds by repo and status
|
||||||
|
- `homelab_deploy_build_host_total{repo, host, status}` - per-host build counts
|
||||||
|
- `homelab_deploy_build_duration_seconds_{bucket,sum,count}` - build duration histogram
|
||||||
|
- `homelab_deploy_build_last_timestamp{repo}` - last build attempt
|
||||||
|
- `homelab_deploy_build_last_success_timestamp{repo}` - last successful build
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [x] ~~When to cut over DNS from nix-cache01 to nix-cache02?~~ Done - 2026-02-10
|
||||||
|
- [ ] Implement safe flake update workflow before or after full migration?
|
||||||
87
docs/plans/completed/openbao-kanidm-oidc.md
Normal file
87
docs/plans/completed/openbao-kanidm-oidc.md
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# OpenBao + Kanidm OIDC Integration
|
||||||
|
|
||||||
|
## Status: Completed
|
||||||
|
|
||||||
|
Implemented 2026-02-09.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Enable Kanidm users to authenticate to OpenBao (Vault) using OIDC for Web UI access. Members of the `admins` group get full read/write access to secrets.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Files Modified
|
||||||
|
|
||||||
|
| File | Changes |
|
||||||
|
|------|---------|
|
||||||
|
| `terraform/vault/oidc.tf` | New - OIDC auth backend and roles |
|
||||||
|
| `terraform/vault/policies.tf` | Added oidc-admin and oidc-default policies |
|
||||||
|
| `terraform/vault/secrets.tf` | Added OAuth2 client secret |
|
||||||
|
| `terraform/vault/approle.tf` | Granted kanidm01 access to openbao secrets |
|
||||||
|
| `services/kanidm/default.nix` | Added openbao OAuth2 client, enabled imperative group membership |
|
||||||
|
|
||||||
|
### Kanidm Configuration
|
||||||
|
|
||||||
|
OAuth2 client `openbao` with:
|
||||||
|
- Confidential client (uses client secret)
|
||||||
|
- Web UI callback only: `https://vault.home.2rjus.net:8200/ui/vault/auth/oidc/oidc/callback`
|
||||||
|
- Legacy crypto enabled (RS256 for OpenBao compatibility)
|
||||||
|
- Scope maps for `admins` and `users` groups
|
||||||
|
|
||||||
|
Group membership is now managed imperatively (`overwriteMembers = false`) to prevent provisioning from resetting group memberships on service restart.
|
||||||
|
|
||||||
|
### OpenBao Configuration
|
||||||
|
|
||||||
|
OIDC auth backend at `/oidc` with two roles:
|
||||||
|
|
||||||
|
| Role | Bound Claims | Policy | Access |
|
||||||
|
|------|--------------|--------|--------|
|
||||||
|
| `admin` | `groups = admins@home.2rjus.net` | `oidc-admin` | Full read/write to secrets, system health/metrics |
|
||||||
|
| `default` | (none) | `oidc-default` | Token lookup-self, system health |
|
||||||
|
|
||||||
|
Both roles request scopes: `openid`, `profile`, `email`, `groups`
|
||||||
|
|
||||||
|
### Policies
|
||||||
|
|
||||||
|
**oidc-admin:**
|
||||||
|
- `secret/*` - create, read, update, delete, list
|
||||||
|
- `sys/health` - read
|
||||||
|
- `sys/metrics` - read
|
||||||
|
- `sys/auth` - read
|
||||||
|
- `sys/mounts` - read
|
||||||
|
|
||||||
|
**oidc-default:**
|
||||||
|
- `auth/token/lookup-self` - read
|
||||||
|
- `sys/health` - read
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Web UI Login
|
||||||
|
1. Navigate to https://vault.home.2rjus.net:8200
|
||||||
|
2. Select "OIDC" authentication method
|
||||||
|
3. Enter role: `admin` (for admins) or `default` (for any user)
|
||||||
|
4. Click "Sign in with OIDC"
|
||||||
|
5. Authenticate with Kanidm
|
||||||
|
|
||||||
|
### Group Management
|
||||||
|
Add users to admins group for full access:
|
||||||
|
```bash
|
||||||
|
kanidm group add-members admins <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Limitations
|
||||||
|
|
||||||
|
**CLI login not supported:** Kanidm requires HTTPS for all redirect URIs on confidential (non-public) OAuth2 clients. OpenBao CLI uses `http://localhost:8250/oidc/callback` which Kanidm rejects. Public clients would allow localhost redirects, but OpenBao requires a client secret for OIDC auth.
|
||||||
|
|
||||||
|
## Lessons Learned
|
||||||
|
|
||||||
|
1. **Kanidm group names:** Groups are returned as `groupname@domain` (e.g., `admins@home.2rjus.net`), not just the short name
|
||||||
|
2. **RS256 required:** OpenBao only supports RS256 for JWT signing; Kanidm defaults to ES256, requiring `enableLegacyCrypto = true`
|
||||||
|
3. **Scope request:** OIDC roles must explicitly request the `groups` scope via `oidc_scopes`
|
||||||
|
4. **Provisioning resets:** Kanidm provisioning with default `overwriteMembers = true` resets group memberships on restart
|
||||||
|
5. **Two-phase Terraform:** Secret must exist before OIDC backend can validate discovery URL
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [OpenBao JWT/OIDC Auth Method](https://openbao.org/docs/auth/jwt/)
|
||||||
|
- [Kanidm OAuth2 Documentation](https://kanidm.github.io/kanidm/stable/integrations/oauth2.html)
|
||||||
@@ -20,9 +20,9 @@ Hosts to migrate:
|
|||||||
| http-proxy | Stateless | Reverse proxy, recreate |
|
| http-proxy | Stateless | Reverse proxy, recreate |
|
||||||
| nats1 | Stateless | Messaging, recreate |
|
| nats1 | Stateless | Messaging, recreate |
|
||||||
| ha1 | Stateful | Home Assistant + Zigbee2MQTT + Mosquitto |
|
| ha1 | Stateful | Home Assistant + Zigbee2MQTT + Mosquitto |
|
||||||
| monitoring01 | Stateful | Prometheus, Grafana, Loki |
|
| ~~monitoring01~~ | ~~Decommission~~ | ✓ Complete — replaced by monitoring02 (VictoriaMetrics) |
|
||||||
| jelly01 | Stateful | Jellyfin metadata, watch history, config |
|
| jelly01 | Stateful | Jellyfin metadata, watch history, config |
|
||||||
| pgdb1 | Decommission | Only used by Open WebUI on gunter, migrating to local postgres |
|
| ~~pgdb1~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
| ~~jump~~ | ~~Decommission~~ | ✓ Complete |
|
| ~~jump~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
| ~~auth01~~ | ~~Decommission~~ | ✓ Complete |
|
| ~~auth01~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
| ~~ca~~ | ~~Deferred~~ | ✓ Complete |
|
| ~~ca~~ | ~~Deferred~~ | ✓ Complete |
|
||||||
@@ -31,10 +31,12 @@ Hosts to migrate:
|
|||||||
|
|
||||||
Before migrating any stateful host, ensure restic backups are in place and verified.
|
Before migrating any stateful host, ensure restic backups are in place and verified.
|
||||||
|
|
||||||
### 1a. Expand monitoring01 Grafana Backup
|
### ~~1a. Expand monitoring01 Grafana Backup~~ ✓ N/A
|
||||||
|
|
||||||
The existing backup only covers `/var/lib/grafana/plugins` and a sqlite dump of `grafana.db`.
|
~~The existing backup only covers `/var/lib/grafana/plugins` and a sqlite dump of `grafana.db`.
|
||||||
Expand to back up all of `/var/lib/grafana/` to capture config directory and any other state.
|
Expand to back up all of `/var/lib/grafana/` to capture config directory and any other state.~~
|
||||||
|
|
||||||
|
No longer needed — monitoring01 decommissioned, replaced by monitoring02 with declarative Grafana dashboards.
|
||||||
|
|
||||||
### 1b. Add Jellyfin Backup to jelly01
|
### 1b. Add Jellyfin Backup to jelly01
|
||||||
|
|
||||||
@@ -94,15 +96,17 @@ For each stateful host, the procedure is:
|
|||||||
7. Start services and verify functionality
|
7. Start services and verify functionality
|
||||||
8. Decommission the old VM
|
8. Decommission the old VM
|
||||||
|
|
||||||
### 3a. monitoring01
|
### 3a. monitoring01 ✓ COMPLETE
|
||||||
|
|
||||||
1. Run final Grafana backup
|
~~1. Run final Grafana backup~~
|
||||||
2. Provision new monitoring01 via OpenTofu
|
~~2. Provision new monitoring01 via OpenTofu~~
|
||||||
3. After bootstrap, restore `/var/lib/grafana/` from restic
|
~~3. After bootstrap, restore `/var/lib/grafana/` from restic~~
|
||||||
4. Restart Grafana, verify dashboards and datasources are intact
|
~~4. Restart Grafana, verify dashboards and datasources are intact~~
|
||||||
5. Prometheus and Loki start fresh with empty data (acceptable)
|
~~5. Prometheus and Loki start fresh with empty data (acceptable)~~
|
||||||
6. Verify all scrape targets are being collected
|
~~6. Verify all scrape targets are being collected~~
|
||||||
7. Decommission old VM
|
~~7. Decommission old VM~~
|
||||||
|
|
||||||
|
Replaced by monitoring02 with VictoriaMetrics, standalone Loki and Grafana modules. Host configuration, old service modules, and terraform resources removed.
|
||||||
|
|
||||||
### 3b. jelly01
|
### 3b. jelly01
|
||||||
|
|
||||||
@@ -163,19 +167,19 @@ Host was already removed from flake.nix and VM destroyed. Configuration cleaned
|
|||||||
|
|
||||||
Host configuration, services, and VM already removed.
|
Host configuration, services, and VM already removed.
|
||||||
|
|
||||||
### pgdb1 (in progress)
|
### pgdb1 ✓ COMPLETE
|
||||||
|
|
||||||
Only consumer was Open WebUI on gunter, which has been migrated to use local PostgreSQL.
|
~~Only consumer was Open WebUI on gunter, which has been migrated to use local PostgreSQL.~~
|
||||||
|
|
||||||
1. ~~Verify Open WebUI on gunter is using local PostgreSQL (not pgdb1)~~ ✓
|
~~1. Verify Open WebUI on gunter is using local PostgreSQL (not pgdb1)~~
|
||||||
2. ~~Remove host configuration from `hosts/pgdb1/`~~ ✓
|
~~2. Remove host configuration from `hosts/pgdb1/`~~
|
||||||
3. ~~Remove `services/postgres/` (only used by pgdb1)~~ ✓
|
~~3. Remove `services/postgres/` (only used by pgdb1)~~
|
||||||
4. ~~Remove from `flake.nix`~~ ✓
|
~~4. Remove from `flake.nix`~~
|
||||||
5. ~~Remove Vault AppRole from `terraform/vault/approle.tf`~~ ✓
|
~~5. Remove Vault AppRole from `terraform/vault/approle.tf`~~
|
||||||
6. Destroy the VM in Proxmox
|
~~6. Destroy the VM in Proxmox~~
|
||||||
7. ~~Commit cleanup~~ ✓
|
~~7. Commit cleanup~~
|
||||||
|
|
||||||
See `docs/plans/pgdb1-decommission.md` for detailed plan.
|
Host configuration, services, terraform resources, and VM removed. See `docs/plans/pgdb1-decommission.md` for detailed plan.
|
||||||
|
|
||||||
## Phase 5: Decommission ca Host ✓ COMPLETE
|
## Phase 5: Decommission ca Host ✓ COMPLETE
|
||||||
|
|
||||||
|
|||||||
196
docs/plans/loki-improvements.md
Normal file
196
docs/plans/loki-improvements.md
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
# Loki Setup Improvements
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The current Loki deployment on monitoring01 is functional but minimal. It lacks retention policies and rate limiting, and uses local filesystem storage. This plan evaluates improvement options across several dimensions: retention management, storage backend, resource limits, and operational improvements.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
**Loki** on monitoring01 (`services/monitoring/loki.nix`):
|
||||||
|
- Single-node deployment, no HA
|
||||||
|
- Filesystem storage at `/var/lib/loki/chunks` (~6.8 GB as of 2026-02-13)
|
||||||
|
- TSDB index (v13 schema, 24h period)
|
||||||
|
- 30-day compactor-based retention with basic rate limits
|
||||||
|
- No caching layer
|
||||||
|
- Auth disabled (trusted network)
|
||||||
|
|
||||||
|
**Promtail** on all 16 hosts (`system/monitoring/logs.nix`):
|
||||||
|
- Ships systemd journal (JSON) + `/var/log/**/*.log`
|
||||||
|
- Labels: `hostname`, `tier`, `role`, `level`, `job` (systemd-journal/varlog), `systemd_unit`
|
||||||
|
- `level` label mapped from journal PRIORITY (critical/error/warning/notice/info/debug)
|
||||||
|
- Hardcoded to `http://monitoring01.home.2rjus.net:3100`
|
||||||
|
|
||||||
|
**Additional log sources:**
|
||||||
|
- `pipe-to-loki` script (manual log submission, `job=pipe-to-loki`)
|
||||||
|
- Bootstrap logs from template2 (`job=bootstrap`)
|
||||||
|
|
||||||
|
**Context:** The VictoriaMetrics migration plan (`docs/plans/monitoring-migration-victoriametrics.md`) includes moving Loki to monitoring02 with "same configuration as current". These improvements could be applied either before or after that migration.
|
||||||
|
|
||||||
|
## Improvement Areas
|
||||||
|
|
||||||
|
### 1. Retention Policy
|
||||||
|
|
||||||
|
**Implemented.** Compactor-based retention with 30-day period. Note: Loki 3.6.3 requires `delete_request_store = "filesystem"` when retention is enabled (not documented in older guides).
|
||||||
|
|
||||||
|
```nix
|
||||||
|
compactor = {
|
||||||
|
working_directory = "/var/lib/loki/compactor";
|
||||||
|
compaction_interval = "10m";
|
||||||
|
retention_enabled = true;
|
||||||
|
retention_delete_delay = "2h";
|
||||||
|
retention_delete_worker_count = 150;
|
||||||
|
delete_request_store = "filesystem";
|
||||||
|
};
|
||||||
|
|
||||||
|
limits_config = {
|
||||||
|
retention_period = "30d";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Storage Backend
|
||||||
|
|
||||||
|
**Decision:** Stay with filesystem storage for now. Garage S3 was considered but ruled out — the current single-node Garage (replication_factor=1) offers no real durability benefit over local disk. S3 storage can be revisited after the NAS migration, when a more robust S3-compatible solution will likely be available.
|
||||||
|
|
||||||
|
### 3. Limits Configuration
|
||||||
|
|
||||||
|
**Implemented.** Basic guardrails added alongside retention in `limits_config`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
limits_config = {
|
||||||
|
retention_period = "30d";
|
||||||
|
ingestion_rate_mb = 10; # MB/s per tenant
|
||||||
|
ingestion_burst_size_mb = 20; # Burst allowance
|
||||||
|
max_streams_per_user = 10000; # Prevent label explosion
|
||||||
|
max_query_series = 500; # Limit query resource usage
|
||||||
|
max_query_parallelism = 8;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Promtail Label Improvements
|
||||||
|
|
||||||
|
**Problem:** Label inconsistencies and missing useful metadata:
|
||||||
|
- The `varlog` scrape config uses `hostname` while journal uses `host` (different label name)
|
||||||
|
- No `tier` or `role` labels, making it hard to filter logs by deployment tier or host function
|
||||||
|
|
||||||
|
**Implemented:** Standardized on `hostname` to match Prometheus labels. The journal scrape previously used a relabel from `__journal__hostname` to `host`; now both scrape configs use a static `hostname` label from `config.networking.hostName`. Also updated `pipe-to-loki` and bootstrap scripts to use `hostname` instead of `host`.
|
||||||
|
|
||||||
|
1. **Standardized label:** Both scrape configs use `hostname` (matching Prometheus) via shared `hostLabels`
|
||||||
|
2. **Added `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs
|
||||||
|
3. **Added `role` label:** Static label from `config.homelab.host.role` on both scrape configs (conditionally, only when non-null)
|
||||||
|
|
||||||
|
No cardinality impact — `tier` and `role` are 1:1 with `hostname`, so they add metadata to existing streams without creating new ones.
|
||||||
|
|
||||||
|
This enables queries like:
|
||||||
|
- `{tier="prod"} |= "error"` - all errors on prod hosts
|
||||||
|
- `{role="dns"}` - all DNS server logs
|
||||||
|
- `{tier="test", job="systemd-journal"}` - journal logs from test hosts
|
||||||
|
|
||||||
|
### 5. Journal Priority → Level Label
|
||||||
|
|
||||||
|
**Implemented.** Promtail pipeline stages map journal `PRIORITY` to a `level` label:
|
||||||
|
|
||||||
|
| PRIORITY | level |
|
||||||
|
|----------|-------|
|
||||||
|
| 0-2 | critical |
|
||||||
|
| 3 | error |
|
||||||
|
| 4 | warning |
|
||||||
|
| 5 | notice |
|
||||||
|
| 6 | info |
|
||||||
|
| 7 | debug |
|
||||||
|
|
||||||
|
Uses a `json` stage to extract PRIORITY, `template` to map to level name, and `labels` to attach it. This gives reliable level filtering for all journal logs, unlike Loki's `detected_level` which only works for apps that embed level keywords in message text.
|
||||||
|
|
||||||
|
Example queries:
|
||||||
|
- `{level="error"}` - all errors across the fleet
|
||||||
|
- `{level=~"critical|error", tier="prod"}` - prod errors and criticals
|
||||||
|
- `{level="warning", role="dns"}` - warnings from DNS servers
|
||||||
|
|
||||||
|
### 6. Enable JSON Logging on Services
|
||||||
|
|
||||||
|
**Problem:** Many services support structured JSON log output but may be using plain text by default. JSON logs are significantly easier to query in Loki — `| json` cleanly extracts all fields, whereas plain text requires fragile regex or pattern matching.
|
||||||
|
|
||||||
|
**Audit results (2026-02-13):**
|
||||||
|
|
||||||
|
**Already logging JSON:**
|
||||||
|
- Caddy (all instances) - JSON by default for access logs
|
||||||
|
- homelab-deploy (listener/builder) - Go app, logs structured JSON
|
||||||
|
|
||||||
|
**Supports JSON, not configured (high value):**
|
||||||
|
|
||||||
|
| Service | How to enable | Config file |
|
||||||
|
|---------|--------------|-------------|
|
||||||
|
| Prometheus | `--log.format=json` | `services/monitoring/prometheus.nix` |
|
||||||
|
| Alertmanager | `--log.format=json` | `services/monitoring/prometheus.nix` |
|
||||||
|
| Loki | `--log.format=json` | `services/monitoring/loki.nix` |
|
||||||
|
| Grafana | `log.console.format = "json"` | `services/monitoring/grafana.nix` |
|
||||||
|
| Tempo | `log_format: json` in config | `services/monitoring/tempo.nix` |
|
||||||
|
| OpenBao | `log_format = "json"` | `services/vault/default.nix` |
|
||||||
|
|
||||||
|
**Supports JSON, not configured (lower value - minimal log output):**
|
||||||
|
|
||||||
|
| Service | How to enable |
|
||||||
|
|---------|--------------|
|
||||||
|
| Pyroscope | `--log.format=json` (OCI container) |
|
||||||
|
| Blackbox Exporter | `--log.format=json` |
|
||||||
|
| Node Exporter | `--log.format=json` (all 16 hosts) |
|
||||||
|
| Systemd Exporter | `--log.format=json` (all 16 hosts) |
|
||||||
|
|
||||||
|
**No JSON support (syslog/text only):**
|
||||||
|
- NSD, Unbound, OpenSSH, Mosquitto
|
||||||
|
|
||||||
|
**Needs verification:**
|
||||||
|
- Kanidm, Jellyfin, Home Assistant, Harmonia, Zigbee2MQTT, NATS
|
||||||
|
|
||||||
|
**Recommendation:** Start with the monitoring stack (Prometheus, Alertmanager, Loki, Grafana, Tempo) since they're all Go apps with the same `--log.format=json` flag. Then OpenBao. The exporters are lower priority since they produce minimal log output.
|
||||||
|
|
||||||
|
### 7. Monitoring CNAME for Promtail Target
|
||||||
|
|
||||||
|
**Problem:** Promtail hardcodes `monitoring01.home.2rjus.net:3100`. The VictoriaMetrics migration plan already addresses this by switching to a `monitoring` CNAME.
|
||||||
|
|
||||||
|
**Recommendation:** This should happen as part of the monitoring02 migration, not independently. If we do Loki improvements before that migration, keep pointing to monitoring01.
|
||||||
|
|
||||||
|
## Priority Ranking
|
||||||
|
|
||||||
|
| # | Improvement | Effort | Impact | Status |
|
||||||
|
|---|-------------|--------|--------|--------|
|
||||||
|
| 1 | **Retention policy** | Low | High | Done (30d compactor retention) |
|
||||||
|
| 2 | **Limits config** | Low | Medium | Done (rate limits + stream guards) |
|
||||||
|
| 3 | **Promtail labels** | Trivial | Low | Done (hostname/tier/role/level) |
|
||||||
|
| 4 | **Journal priority → level** | Low-medium | Medium | Done (pipeline stages) |
|
||||||
|
| 5 | **JSON logging audit** | Low-medium | Medium | Audited, not yet enabled |
|
||||||
|
| 6 | **Monitoring CNAME** | Low | Medium | Part of monitoring02 migration |
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
### Phase 1: Retention + Labels (done 2026-02-13)
|
||||||
|
|
||||||
|
1. ~~Add `compactor` section to `services/monitoring/loki.nix`~~ Done
|
||||||
|
2. ~~Add `limits_config` with 30-day retention and basic rate limits~~ Done
|
||||||
|
3. ~~Update `system/monitoring/logs.nix`~~ Done:
|
||||||
|
- Standardized on `hostname` label (matching Prometheus) for both scrape configs
|
||||||
|
- Added `tier` and `role` static labels from `homelab.host` options
|
||||||
|
- Added pipeline stages for journal PRIORITY → `level` label mapping
|
||||||
|
4. ~~Update `pipe-to-loki` and bootstrap scripts to use `hostname`~~ Done
|
||||||
|
5. ~~Deploy and verify labels~~ Done - all 15 hosts reporting with correct labels
|
||||||
|
|
||||||
|
### Phase 2: JSON Logging (not started)
|
||||||
|
|
||||||
|
Enable JSON logging on services that support it, starting with the monitoring stack:
|
||||||
|
1. Prometheus, Alertmanager, Loki, Grafana, Tempo (`--log.format=json`)
|
||||||
|
2. OpenBao (`log_format = "json"`)
|
||||||
|
3. Lower priority: exporters (node-exporter, systemd-exporter, blackbox)
|
||||||
|
|
||||||
|
### Phase 3 (future): S3 Storage Migration
|
||||||
|
|
||||||
|
Revisit after NAS migration when a proper S3-compatible storage solution is available. At that point, add a new schema period with `object_store = "s3"` - the old filesystem period will continue serving historical data until it ages out past retention.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] Do we want per-stream retention (e.g., keep bootstrap/pipe-to-loki longer)?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Loki schema changes require adding a new period entry (not modifying existing ones). The old period continues serving historical data.
|
||||||
|
- Loki 3.6.3 requires `delete_request_store = "filesystem"` in the compactor config when retention is enabled.
|
||||||
|
- S3 storage deferred until post-NAS migration when a proper solution is available.
|
||||||
|
- As of 2026-02-13, Loki uses ~6.8 GB for ~30 days of logs from 16 hosts. Prometheus uses ~7.6 GB on the same disk (33 GB total, ~8 GB free).
|
||||||
@@ -1,219 +0,0 @@
|
|||||||
# Monitoring Stack Migration to VictoriaMetrics
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
Migrate from Prometheus to VictoriaMetrics on a new host (monitoring02) to gain better compression
|
|
||||||
and longer retention. Run in parallel with monitoring01 until validated, then switch over using
|
|
||||||
a `monitoring` CNAME for seamless transition.
|
|
||||||
|
|
||||||
## Current State
|
|
||||||
|
|
||||||
**monitoring01** (10.69.13.13):
|
|
||||||
- 4 CPU cores, 4GB RAM, 33GB disk
|
|
||||||
- Prometheus with 30-day retention (15s scrape interval)
|
|
||||||
- Alertmanager (routes to alerttonotify webhook)
|
|
||||||
- Grafana (dashboards, datasources)
|
|
||||||
- Loki (log aggregation from all hosts via Promtail)
|
|
||||||
- Tempo (distributed tracing)
|
|
||||||
- Pyroscope (continuous profiling)
|
|
||||||
|
|
||||||
**Hardcoded References to monitoring01:**
|
|
||||||
- `system/monitoring/logs.nix` - Promtail sends logs to `http://monitoring01.home.2rjus.net:3100`
|
|
||||||
- `hosts/template2/bootstrap.nix` - Bootstrap logs to Loki (keep as-is until decommission)
|
|
||||||
- `services/http-proxy/proxy.nix` - Caddy proxies Prometheus, Alertmanager, Grafana, Pyroscope, Pushgateway
|
|
||||||
|
|
||||||
**Auto-generated:**
|
|
||||||
- Prometheus scrape targets (from `lib/monitoring.nix` + `homelab.monitoring.scrapeTargets`)
|
|
||||||
- Node-exporter targets (from all hosts with static IPs)
|
|
||||||
|
|
||||||
## Decision: VictoriaMetrics
|
|
||||||
|
|
||||||
Per `docs/plans/long-term-metrics-storage.md`, VictoriaMetrics is the recommended starting point:
|
|
||||||
- Single binary replacement for Prometheus
|
|
||||||
- 5-10x better compression (30 days could become 180+ days in same space)
|
|
||||||
- Same PromQL query language (Grafana dashboards work unchanged)
|
|
||||||
- Same scrape config format (existing auto-generated configs work)
|
|
||||||
|
|
||||||
If multi-year retention with downsampling becomes necessary later, Thanos can be evaluated.
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────┐
|
|
||||||
│ monitoring02 │
|
|
||||||
│ VictoriaMetrics│
|
|
||||||
│ + Grafana │
|
|
||||||
monitoring │ + Loki │
|
|
||||||
CNAME ──────────│ + Tempo │
|
|
||||||
│ + Pyroscope │
|
|
||||||
│ + Alertmanager │
|
|
||||||
│ (vmalert) │
|
|
||||||
└─────────────────┘
|
|
||||||
▲
|
|
||||||
│ scrapes
|
|
||||||
┌───────────────┼───────────────┐
|
|
||||||
│ │ │
|
|
||||||
┌────┴────┐ ┌─────┴────┐ ┌─────┴────┐
|
|
||||||
│ ns1 │ │ ha1 │ │ ... │
|
|
||||||
│ :9100 │ │ :9100 │ │ :9100 │
|
|
||||||
└─────────┘ └──────────┘ └──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Implementation Plan
|
|
||||||
|
|
||||||
### Phase 1: Create monitoring02 Host
|
|
||||||
|
|
||||||
Use `create-host` script which handles flake.nix and terraform/vms.tf automatically.
|
|
||||||
|
|
||||||
1. **Run create-host**: `nix develop -c create-host monitoring02 10.69.13.24`
|
|
||||||
2. **Update VM resources** in `terraform/vms.tf`:
|
|
||||||
- 4 cores (same as monitoring01)
|
|
||||||
- 8GB RAM (double, for VictoriaMetrics headroom)
|
|
||||||
- 100GB disk (for 3+ months retention with compression)
|
|
||||||
3. **Update host configuration**: Import monitoring services
|
|
||||||
4. **Create Vault AppRole**: Add to `terraform/vault/approle.tf`
|
|
||||||
|
|
||||||
### Phase 2: Set Up VictoriaMetrics Stack
|
|
||||||
|
|
||||||
Create new service module at `services/monitoring/victoriametrics/` for testing alongside existing
|
|
||||||
Prometheus config. Once validated, this can replace the Prometheus module.
|
|
||||||
|
|
||||||
1. **VictoriaMetrics** (port 8428):
|
|
||||||
- `services.victoriametrics.enable = true`
|
|
||||||
- `services.victoriametrics.retentionPeriod = "3m"` (3 months, increase later based on disk usage)
|
|
||||||
- Migrate scrape configs via `prometheusConfig`
|
|
||||||
- Use native push support (replaces Pushgateway)
|
|
||||||
|
|
||||||
2. **vmalert** for alerting rules:
|
|
||||||
- `services.vmalert.enable = true`
|
|
||||||
- Point to VictoriaMetrics for metrics evaluation
|
|
||||||
- Keep rules in separate `rules.yml` file (same format as Prometheus)
|
|
||||||
- No receiver configured during parallel operation (prevents duplicate alerts)
|
|
||||||
|
|
||||||
3. **Alertmanager** (port 9093):
|
|
||||||
- Keep existing configuration (alerttonotify webhook routing)
|
|
||||||
- Only enable receiver after cutover from monitoring01
|
|
||||||
|
|
||||||
4. **Loki** (port 3100):
|
|
||||||
- Same configuration as current
|
|
||||||
|
|
||||||
5. **Grafana** (port 3000):
|
|
||||||
- Define dashboards declaratively via NixOS options (not imported from monitoring01)
|
|
||||||
- Reference existing dashboards on monitoring01 for content inspiration
|
|
||||||
- Configure VictoriaMetrics datasource (port 8428)
|
|
||||||
- Configure Loki datasource
|
|
||||||
|
|
||||||
6. **Tempo** (ports 3200, 3201):
|
|
||||||
- Same configuration
|
|
||||||
|
|
||||||
7. **Pyroscope** (port 4040):
|
|
||||||
- Same Docker-based deployment
|
|
||||||
|
|
||||||
### Phase 3: Parallel Operation
|
|
||||||
|
|
||||||
Run both monitoring01 and monitoring02 simultaneously:
|
|
||||||
|
|
||||||
1. **Dual scraping**: Both hosts scrape the same targets
|
|
||||||
- Validates VictoriaMetrics is collecting data correctly
|
|
||||||
|
|
||||||
2. **Dual log shipping**: Configure Promtail to send logs to both Loki instances
|
|
||||||
- Add second client in `system/monitoring/logs.nix` pointing to monitoring02
|
|
||||||
|
|
||||||
3. **Validate dashboards**: Access Grafana on monitoring02, verify dashboards work
|
|
||||||
|
|
||||||
4. **Validate alerts**: Verify vmalert evaluates rules correctly (no receiver = no notifications)
|
|
||||||
|
|
||||||
5. **Compare resource usage**: Monitor disk/memory consumption between hosts
|
|
||||||
|
|
||||||
### Phase 4: Add monitoring CNAME
|
|
||||||
|
|
||||||
Add CNAME to monitoring02 once validated:
|
|
||||||
|
|
||||||
```nix
|
|
||||||
# hosts/monitoring02/configuration.nix
|
|
||||||
homelab.dns.cnames = [ "monitoring" ];
|
|
||||||
```
|
|
||||||
|
|
||||||
This creates `monitoring.home.2rjus.net` pointing to monitoring02.
|
|
||||||
|
|
||||||
### Phase 5: Update References
|
|
||||||
|
|
||||||
Update hardcoded references to use the CNAME:
|
|
||||||
|
|
||||||
1. **system/monitoring/logs.nix**:
|
|
||||||
- Remove dual-shipping, point only to `http://monitoring.home.2rjus.net:3100`
|
|
||||||
|
|
||||||
2. **services/http-proxy/proxy.nix**: Update reverse proxy backends:
|
|
||||||
- prometheus.home.2rjus.net -> monitoring.home.2rjus.net:8428
|
|
||||||
- alertmanager.home.2rjus.net -> monitoring.home.2rjus.net:9093
|
|
||||||
- grafana.home.2rjus.net -> monitoring.home.2rjus.net:3000
|
|
||||||
- pyroscope.home.2rjus.net -> monitoring.home.2rjus.net:4040
|
|
||||||
|
|
||||||
Note: `hosts/template2/bootstrap.nix` stays pointed at monitoring01 until decommission.
|
|
||||||
|
|
||||||
### Phase 6: Enable Alerting
|
|
||||||
|
|
||||||
Once ready to cut over:
|
|
||||||
1. Enable Alertmanager receiver on monitoring02
|
|
||||||
2. Verify test alerts route correctly
|
|
||||||
|
|
||||||
### Phase 7: Cutover and Decommission
|
|
||||||
|
|
||||||
1. **Stop monitoring01**: Prevent duplicate alerts during transition
|
|
||||||
2. **Update bootstrap.nix**: Point to `monitoring.home.2rjus.net`
|
|
||||||
3. **Verify all targets scraped**: Check VictoriaMetrics UI
|
|
||||||
4. **Verify logs flowing**: Check Loki on monitoring02
|
|
||||||
5. **Decommission monitoring01**:
|
|
||||||
- Remove from flake.nix
|
|
||||||
- Remove host configuration
|
|
||||||
- Destroy VM in Proxmox
|
|
||||||
- Remove from terraform state
|
|
||||||
|
|
||||||
## Open Questions
|
|
||||||
|
|
||||||
- [ ] What disk size for monitoring02? 100GB should allow 3+ months with VictoriaMetrics compression
|
|
||||||
- [ ] Which dashboards to recreate declaratively? (Review monitoring01 Grafana for current set)
|
|
||||||
|
|
||||||
## VictoriaMetrics Service Configuration
|
|
||||||
|
|
||||||
Example NixOS configuration for monitoring02:
|
|
||||||
|
|
||||||
```nix
|
|
||||||
# VictoriaMetrics replaces Prometheus
|
|
||||||
services.victoriametrics = {
|
|
||||||
enable = true;
|
|
||||||
retentionPeriod = "3m"; # 3 months, increase based on disk usage
|
|
||||||
prometheusConfig = {
|
|
||||||
global.scrape_interval = "15s";
|
|
||||||
scrape_configs = [
|
|
||||||
# Auto-generated node-exporter targets
|
|
||||||
# Service-specific scrape targets
|
|
||||||
# External targets
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
# vmalert for alerting rules (no receiver during parallel operation)
|
|
||||||
services.vmalert = {
|
|
||||||
enable = true;
|
|
||||||
datasource.url = "http://localhost:8428";
|
|
||||||
# notifier.alertmanager.url = "http://localhost:9093"; # Enable after cutover
|
|
||||||
rule = [ ./rules.yml ];
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
## Rollback Plan
|
|
||||||
|
|
||||||
If issues arise after cutover:
|
|
||||||
1. Move `monitoring` CNAME back to monitoring01
|
|
||||||
2. Restart monitoring01 services
|
|
||||||
3. Revert Promtail config to point only to monitoring01
|
|
||||||
4. Revert http-proxy backends
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
- VictoriaMetrics uses port 8428 vs Prometheus 9090
|
|
||||||
- PromQL compatibility is excellent
|
|
||||||
- VictoriaMetrics native push replaces Pushgateway (remove from http-proxy if not needed)
|
|
||||||
- monitoring02 deployed via OpenTofu using `create-host` script
|
|
||||||
- Grafana dashboards defined declaratively via NixOS, not imported from monitoring01 state
|
|
||||||
145
docs/plans/new-services.md
Normal file
145
docs/plans/new-services.md
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
# New Service Candidates
|
||||||
|
|
||||||
|
Ideas for additional services to deploy in the homelab. These lean more enterprise/obscure
|
||||||
|
than the typical self-hosted fare.
|
||||||
|
|
||||||
|
## Litestream
|
||||||
|
|
||||||
|
Continuous SQLite replication to S3-compatible storage. Streams WAL changes in near-real-time,
|
||||||
|
providing point-in-time recovery without scheduled backup jobs.
|
||||||
|
|
||||||
|
**Why:** Several services use SQLite (Home Assistant, potentially others). Litestream would
|
||||||
|
give continuous backup to Garage S3 with minimal resource overhead and near-zero configuration.
|
||||||
|
Replaces cron-based backup scripts with a small daemon per database.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Garage S3 as replication target (already deployed)
|
||||||
|
- Home Assistant SQLite database is the primary candidate
|
||||||
|
- Could also cover any future SQLite-backed services
|
||||||
|
|
||||||
|
**Complexity:** Low. Single Go binary, minimal config (source DB path + S3 endpoint).
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `litestream`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ntopng
|
||||||
|
|
||||||
|
Deep network traffic analysis and flow monitoring. Provides real-time visibility into bandwidth
|
||||||
|
usage, protocol distribution, top talkers, and anomaly detection via a web UI.
|
||||||
|
|
||||||
|
**Why:** We have host-level metrics (node-exporter) and logs (Loki) but no network-level
|
||||||
|
visibility. ntopng would show traffic patterns across the infrastructure — NFS throughput to
|
||||||
|
the NAS, DNS query volume, inter-host traffic, and bandwidth anomalies. Useful for capacity
|
||||||
|
planning and debugging network issues.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Could export metrics to Prometheus via its built-in exporter
|
||||||
|
- Web UI behind http-proxy with Kanidm OIDC (if supported) or Pomerium
|
||||||
|
- NetFlow/sFlow from managed switches (if available)
|
||||||
|
- Passive traffic capture on a mirror port or the monitoring host itself
|
||||||
|
|
||||||
|
**Complexity:** Medium. Needs network tap or mirror port for full visibility, or can run
|
||||||
|
in host-local mode. May need a dedicated interface or VLAN mirror.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `ntopng`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Renovate
|
||||||
|
|
||||||
|
Automated dependency update bot that understands Nix flakes natively. Creates branches/PRs
|
||||||
|
to bump flake inputs on a configurable schedule.
|
||||||
|
|
||||||
|
**Why:** Currently `nix flake update` is manual. Renovate can automatically propose updates
|
||||||
|
to individual flake inputs (nixpkgs, homelab-deploy, nixos-exporter, etc.), group related
|
||||||
|
updates, and respect schedules. More granular than updating everything at once — can bump
|
||||||
|
nixpkgs weekly but hold back other inputs, auto-merge patch-level changes, etc.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Runs against git.t-juice.club repositories
|
||||||
|
- Understands `flake.lock` format natively
|
||||||
|
- Could target both `nixos-servers` and `nixos` repos
|
||||||
|
- Update branches would be validated by homelab-deploy builder
|
||||||
|
|
||||||
|
**Complexity:** Medium. Needs git forge integration (Gitea/Forgejo API). Self-hosted runner
|
||||||
|
mode available. Configuration via `renovate.json` in each repo.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `renovate`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Pomerium
|
||||||
|
|
||||||
|
Identity-aware reverse proxy implementing zero-trust access. Every request is authenticated
|
||||||
|
and authorized based on identity, device, and context — not just network location.
|
||||||
|
|
||||||
|
**Why:** Currently Caddy terminates TLS but doesn't enforce authentication on most services.
|
||||||
|
Pomerium would put Kanidm OIDC authentication in front of every internal service, with
|
||||||
|
per-route authorization policies (e.g., "only admins can access Prometheus," "require re-auth
|
||||||
|
for Vault UI"). Directly addresses the security hardening plan's goals.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Kanidm as OIDC identity provider (already deployed)
|
||||||
|
- Could replace or sit in front of Caddy for internal services
|
||||||
|
- Per-route policies based on Kanidm groups (admins, users, ssh-users)
|
||||||
|
- Centralizes access logging and audit trail
|
||||||
|
|
||||||
|
**Complexity:** Medium-high. Needs careful integration with existing Caddy reverse proxy.
|
||||||
|
Decision needed on whether Pomerium replaces Caddy or works alongside it (Pomerium for
|
||||||
|
auth, Caddy for TLS termination and routing, or Pomerium handles everything).
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `pomerium`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Apache Guacamole
|
||||||
|
|
||||||
|
Clientless remote desktop and SSH gateway. Provides browser-based access to hosts via
|
||||||
|
RDP, VNC, SSH, and Telnet with no client software required. Supports session recording
|
||||||
|
and playback.
|
||||||
|
|
||||||
|
**Why:** Provides an alternative remote access path that doesn't require VPN software or
|
||||||
|
SSH keys on the client device. Useful for accessing hosts from untrusted machines (phone,
|
||||||
|
borrowed laptop) or providing temporary access to others. Session recording gives an audit
|
||||||
|
trail. Could complement the WireGuard remote access plan rather than replace it.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Kanidm for authentication (OIDC or LDAP)
|
||||||
|
- Behind http-proxy or Pomerium for TLS
|
||||||
|
- SSH access to all hosts in the fleet
|
||||||
|
- Session recordings could be stored on Garage S3
|
||||||
|
- Could serve as the "emergency access" path when VPN is unavailable
|
||||||
|
|
||||||
|
**Complexity:** Medium. Java-based (guacd + web app), typically needs PostgreSQL for
|
||||||
|
connection/user storage (already available). Docker is the common deployment method but
|
||||||
|
native packaging exists.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `guacamole-server` and `guacamole-client`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CrowdSec
|
||||||
|
|
||||||
|
Collaborative intrusion prevention system with crowd-sourced threat intelligence.
|
||||||
|
Parses logs to detect attack patterns, applies remediation (firewall bans, CAPTCHA),
|
||||||
|
and shares/receives threat signals from a global community network.
|
||||||
|
|
||||||
|
**Why:** Goes beyond fail2ban with behavioral detection, crowd-sourced IP reputation,
|
||||||
|
and a scenario-based engine. Fits the security hardening plan. The community blocklist
|
||||||
|
means we benefit from threat intelligence gathered across thousands of deployments.
|
||||||
|
Could parse SSH logs, HTTP access logs, and other service logs to detect and block
|
||||||
|
malicious activity.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Could consume logs from Loki or directly from journald/log files
|
||||||
|
- Firewall bouncer for iptables/nftables remediation
|
||||||
|
- Caddy bouncer for HTTP-level blocking
|
||||||
|
- Prometheus metrics exporter for alert integration
|
||||||
|
- Scenarios available for SSH brute force, HTTP scanning, and more
|
||||||
|
- Feeds into existing alerting pipeline (Alertmanager -> alerttonotify)
|
||||||
|
|
||||||
|
**Complexity:** Medium. Agent (log parser + decision engine) on each host or centralized.
|
||||||
|
Bouncers (enforcement) on edge hosts. Free community tier includes threat intel access.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `crowdsec`.
|
||||||
@@ -1,212 +0,0 @@
|
|||||||
# Nix Cache Host Reprovision
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
Reprovision `nix-cache01` using the OpenTofu workflow, and improve the build/cache system with:
|
|
||||||
1. NATS-based remote build triggering (replacing the current bash script)
|
|
||||||
2. Safer flake update workflow that validates builds before pushing to master
|
|
||||||
|
|
||||||
## Current State
|
|
||||||
|
|
||||||
### Host Configuration
|
|
||||||
- `nix-cache01` at 10.69.13.15 serves the binary cache via Harmonia
|
|
||||||
- Runs Gitea Actions runner for CI workflows
|
|
||||||
- Has `homelab.deploy.enable = true` (already supports NATS-based deployment)
|
|
||||||
- Uses a dedicated XFS volume at `/nix` for cache storage
|
|
||||||
|
|
||||||
### Current Build System (`services/nix-cache/build-flakes.sh`)
|
|
||||||
- Runs every 30 minutes via systemd timer
|
|
||||||
- Clones/pulls two repos: `nixos-servers` and `nixos` (gunter)
|
|
||||||
- Builds all hosts with `nixos-rebuild build` (no blacklist despite docs mentioning it)
|
|
||||||
- Pushes success/failure metrics to pushgateway
|
|
||||||
- Simple but has no filtering, no parallelism, no remote triggering
|
|
||||||
|
|
||||||
### Current Flake Update Workflow (`.github/workflows/flake-update.yaml`)
|
|
||||||
- Runs daily at midnight via cron
|
|
||||||
- Runs `nix flake update --commit-lock-file`
|
|
||||||
- Pushes directly to master
|
|
||||||
- No build validation — can push broken inputs
|
|
||||||
|
|
||||||
## Improvement 1: NATS-Based Remote Build Triggering
|
|
||||||
|
|
||||||
### Design
|
|
||||||
|
|
||||||
Extend the existing `homelab-deploy` tool to support a "build" command that triggers builds on the cache host. This reuses the NATS infrastructure already in place.
|
|
||||||
|
|
||||||
| Approach | Pros | Cons |
|
|
||||||
|----------|------|------|
|
|
||||||
| Extend homelab-deploy | Reuses existing NATS auth, NKey handling, CLI | Adds scope to existing tool |
|
|
||||||
| New nix-cache-tool | Clean separation | Duplicate NATS boilerplate, new credentials |
|
|
||||||
| Gitea Actions webhook | No custom tooling | Less flexible, tied to Gitea |
|
|
||||||
|
|
||||||
**Recommendation:** Extend `homelab-deploy` with a build subcommand. The tool already has NATS client code, authentication handling, and a listener module in NixOS.
|
|
||||||
|
|
||||||
### Implementation
|
|
||||||
|
|
||||||
1. Add new message type to homelab-deploy: `build.<host>` subject
|
|
||||||
2. Listener on nix-cache01 subscribes to `build.>` wildcard
|
|
||||||
3. On message receipt, builds the specified host and returns success/failure
|
|
||||||
4. CLI command: `homelab-deploy build <hostname>` or `homelab-deploy build --all`
|
|
||||||
|
|
||||||
### Benefits
|
|
||||||
- Trigger rebuild for specific host to ensure it's cached
|
|
||||||
- Could be called from CI after merging PRs
|
|
||||||
- Reuses existing NATS infrastructure and auth
|
|
||||||
- Progress/status could stream back via NATS reply
|
|
||||||
|
|
||||||
## Improvement 2: Smarter Flake Update Workflow
|
|
||||||
|
|
||||||
### Current Problems
|
|
||||||
1. Updates can push breaking changes to master
|
|
||||||
2. No visibility into what broke when it does
|
|
||||||
3. Hosts that auto-update can pull broken configs
|
|
||||||
|
|
||||||
### Proposed Workflow
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────────┐
|
|
||||||
│ Flake Update Workflow │
|
|
||||||
├─────────────────────────────────────────────────────────────────┤
|
|
||||||
│ 1. nix flake update (on feature branch) │
|
|
||||||
│ 2. Build ALL hosts locally │
|
|
||||||
│ 3. If all pass → fast-forward merge to master │
|
|
||||||
│ 4. If any fail → create PR with failure logs attached │
|
|
||||||
└─────────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Implementation Options
|
|
||||||
|
|
||||||
| Option | Description | Pros | Cons |
|
|
||||||
|--------|-------------|------|------|
|
|
||||||
| **A: Self-hosted runner** | Build on nix-cache01 | Fast (local cache), simple | Ties up cache host during build |
|
|
||||||
| **B: Gitea Actions only** | Use container runner | Clean separation | Slow (no cache), resource limits |
|
|
||||||
| **C: Hybrid** | Trigger builds on nix-cache01 via NATS from Actions | Best of both | More complex |
|
|
||||||
|
|
||||||
**Recommendation:** Option A with nix-cache01 as the runner. The host is already running Gitea Actions runner and has the cache. Building all ~16 hosts is disk I/O heavy but feasible on dedicated hardware.
|
|
||||||
|
|
||||||
### Workflow Steps
|
|
||||||
|
|
||||||
1. Workflow runs on schedule (daily or weekly)
|
|
||||||
2. Creates branch `flake-update/YYYY-MM-DD`
|
|
||||||
3. Runs `nix flake update --commit-lock-file`
|
|
||||||
4. Builds each host: `nix build .#nixosConfigurations.<host>.config.system.build.toplevel`
|
|
||||||
5. If all succeed:
|
|
||||||
- Fast-forward merge to master
|
|
||||||
- Delete feature branch
|
|
||||||
6. If any fail:
|
|
||||||
- Create PR from the update branch
|
|
||||||
- Attach build logs as PR comment
|
|
||||||
- Label PR with `needs-review` or `build-failure`
|
|
||||||
- Do NOT merge automatically
|
|
||||||
|
|
||||||
### Workflow File Changes
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# New: .github/workflows/flake-update-safe.yaml
|
|
||||||
name: Safe flake update
|
|
||||||
on:
|
|
||||||
schedule:
|
|
||||||
- cron: "0 2 * * 0" # Weekly on Sunday at 2 AM
|
|
||||||
workflow_dispatch: # Manual trigger
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
update-and-validate:
|
|
||||||
runs-on: homelab # Use self-hosted runner on nix-cache01
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
ref: master
|
|
||||||
fetch-depth: 0 # Need full history for merge
|
|
||||||
|
|
||||||
- name: Create update branch
|
|
||||||
run: |
|
|
||||||
          BRANCH="flake-update/$(date +%Y-%m-%d)"; echo "BRANCH=$BRANCH" >> "$GITHUB_ENV"  # persist for later steps (each run: is a separate shell)
|
|
||||||
git checkout -b "$BRANCH"
|
|
||||||
|
|
||||||
- name: Update flake
|
|
||||||
run: nix flake update --commit-lock-file
|
|
||||||
|
|
||||||
- name: Build all hosts
|
|
||||||
id: build
|
|
||||||
run: |
|
|
||||||
FAILED=""
|
|
||||||
for host in $(nix flake show --json | jq -r '.nixosConfigurations | keys[]'); do
|
|
||||||
echo "Building $host..."
|
|
||||||
            if ! nix build ".#nixosConfigurations.$host.config.system.build.toplevel" > "build-$host.log" 2>&1; then
|
|
||||||
FAILED="$FAILED $host"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
          echo "failed=$FAILED" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: Merge to master (if all pass)
|
|
||||||
if: steps.build.outputs.failed == ''
|
|
||||||
run: |
|
|
||||||
git checkout master
|
|
||||||
git merge --ff-only "$BRANCH"
|
|
||||||
git push origin master
|
|
||||||
git push origin --delete "$BRANCH"
|
|
||||||
|
|
||||||
- name: Create PR (if any fail)
|
|
||||||
if: steps.build.outputs.failed != ''
|
|
||||||
run: |
|
|
||||||
git push origin "$BRANCH"
|
|
||||||
# Create PR via Gitea API with build logs
|
|
||||||
# ... (PR creation with log attachment)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Migration Steps
|
|
||||||
|
|
||||||
### Phase 1: Reprovision Host via OpenTofu
|
|
||||||
|
|
||||||
1. Add `nix-cache01` to `terraform/vms.tf`:
|
|
||||||
```hcl
|
|
||||||
"nix-cache01" = {
|
|
||||||
ip = "10.69.13.15/24"
|
|
||||||
cpu_cores = 4
|
|
||||||
memory = 8192
|
|
||||||
disk_size = "100G" # Larger for nix store
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Shut down existing nix-cache01 VM
|
|
||||||
3. Run `tofu apply` to provision new VM
|
|
||||||
4. Verify bootstrap completes and cache is serving
|
|
||||||
|
|
||||||
**Note:** The cache will be cold after reprovision. Run initial builds to populate.
|
|
||||||
|
|
||||||
### Phase 2: Add Build Triggering to homelab-deploy
|
|
||||||
|
|
||||||
1. Add `build` command to homelab-deploy CLI
|
|
||||||
2. Add listener handler in NixOS module for `build.*` subjects
|
|
||||||
3. Update nix-cache01 config to enable build listener
|
|
||||||
4. Test with `homelab-deploy build testvm01`
|
|
||||||
|
|
||||||
### Phase 3: Implement Safe Flake Update Workflow
|
|
||||||
|
|
||||||
1. Create `.github/workflows/flake-update-safe.yaml`
|
|
||||||
2. Disable or remove old `flake-update.yaml`
|
|
||||||
3. Test manually with `workflow_dispatch`
|
|
||||||
4. Monitor first automated run
|
|
||||||
|
|
||||||
### Phase 4: Remove Old Build Script
|
|
||||||
|
|
||||||
1. After new workflow is stable, remove:
|
|
||||||
- `services/nix-cache/build-flakes.nix`
|
|
||||||
- `services/nix-cache/build-flakes.sh`
|
|
||||||
2. The new workflow handles scheduled builds
|
|
||||||
|
|
||||||
## Open Questions
|
|
||||||
|
|
||||||
- [ ] What runner labels should the self-hosted runner use for the update workflow?
|
|
||||||
- [ ] Should we build hosts in parallel (faster) or sequentially (easier to debug)?
|
|
||||||
- [ ] How long to keep flake-update PRs open before auto-closing stale ones?
|
|
||||||
- [ ] Should successful updates trigger a NATS notification to rebuild all hosts?
|
|
||||||
- [ ] What to do about `gunter` (external nixos repo) - include in validation?
|
|
||||||
- [ ] Disk size for new nix-cache01 - is 100G enough for cache + builds?
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
- The existing `homelab.deploy.enable = true` on nix-cache01 means it already has NATS connectivity
|
|
||||||
- The Harmonia service and cache signing key will work the same after reprovision
|
|
||||||
- Actions runner token is in Vault, will be provisioned automatically
|
|
||||||
- Consider adding a `homelab.host.role = "build-host"` label for monitoring/filtering
|
|
||||||
162
docs/plans/nixos-router.md
Normal file
162
docs/plans/nixos-router.md
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
# NixOS Router — Replace EdgeRouter
|
||||||
|
|
||||||
|
Replace the aging Ubiquiti EdgeRouter (gw, 10.69.10.1) with a NixOS-based router.
|
||||||
|
The EdgeRouter is suspected to be a throughput bottleneck. A NixOS router integrates
|
||||||
|
naturally with the existing fleet: same config management, same monitoring pipeline,
|
||||||
|
same deployment workflow.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
- Eliminate the EdgeRouter throughput bottleneck
|
||||||
|
- Full integration with existing monitoring (node-exporter, promtail, Prometheus, Loki)
|
||||||
|
- Declarative firewall and routing config managed in the flake
|
||||||
|
- Inter-VLAN routing for all existing subnets
|
||||||
|
- DHCP server for client subnets
|
||||||
|
- NetFlow/traffic accounting for future ntopng integration
|
||||||
|
- Foundation for WireGuard remote access (see remote-access.md)
|
||||||
|
|
||||||
|
## Current Network Topology
|
||||||
|
|
||||||
|
**Subnets (known VLANs):**
|
||||||
|
| VLAN/Subnet | Purpose | Notable hosts |
|
||||||
|
|----------------|------------------|----------------------------------------|
|
||||||
|
| 10.69.10.0/24 | Gateway | gw (10.69.10.1) |
|
||||||
|
| 10.69.12.0/24 | Core services | nas, pve1, arr jails, restic |
|
||||||
|
| 10.69.13.0/24 | Infrastructure | All NixOS servers (static IPs) |
|
||||||
|
| 10.69.22.0/24 | WLAN | unifi-ctrl |
|
||||||
|
| 10.69.30.0/24 | Workstations | gunter |
|
||||||
|
| 10.69.31.0/24 | Media | media |
|
||||||
|
| 10.69.99.0/24 | Management | sw1 (MikroTik CRS326-24G-2S+) |
|
||||||
|
|
||||||
|
**DNS:** ns1 (10.69.13.5) and ns2 (10.69.13.6) handle all resolution. Upstream is
|
||||||
|
Cloudflare/Google over DoT via Unbound.
|
||||||
|
|
||||||
|
**Switch:** MikroTik CRS326-24G-2S+ — L2 switching with VLAN trunking. Capable of
|
||||||
|
L3 routing via RouterOS but not ideal for sustained routing throughput.
|
||||||
|
|
||||||
|
## Hardware
|
||||||
|
|
||||||
|
Needs a small x86 box with:
|
||||||
|
- At least 2 NICs (WAN + LAN trunk). Dual 2.5GbE preferred.
|
||||||
|
- Enough CPU for nftables NAT at line rate (any modern x86 is fine)
|
||||||
|
- 4-8 GB RAM (plenty for routing + DHCP + NetFlow accounting)
|
||||||
|
- Low power consumption, fanless preferred for always-on use
|
||||||
|
|
||||||
|
Candidates:
|
||||||
|
- Topton / CWWK mini PC with dual/quad Intel 2.5GbE (~100-150 EUR)
|
||||||
|
- Protectli Vault (more expensive, ~200-300 EUR, proven in pfSense/OPNsense community)
|
||||||
|
- Any mini PC with one onboard NIC + one USB 2.5GbE adapter (cheapest, less ideal)
|
||||||
|
|
||||||
|
The LAN port would carry a VLAN trunk to the MikroTik switch, with sub-interfaces
|
||||||
|
for each VLAN. WAN port connects to the ISP uplink.
|
||||||
|
|
||||||
|
## NixOS Configuration
|
||||||
|
|
||||||
|
### Stability Policy
|
||||||
|
|
||||||
|
The router is treated differently from the rest of the fleet:
|
||||||
|
- **No auto-upgrade** — `system.autoUpgrade.enable = false`
|
||||||
|
- **No homelab-deploy listener** — `homelab.deploy.enable = false`
|
||||||
|
- **Manual updates only** — update every few months, test-build first
|
||||||
|
- **Use `nixos-rebuild boot`** — changes take effect on next deliberate reboot
|
||||||
|
- **Tier: prod, priority: high** — alerts treated with highest priority
|
||||||
|
|
||||||
|
### Core Services
|
||||||
|
|
||||||
|
**Routing & NAT:**
|
||||||
|
- `systemd-networkd` for all interface config (consistent with rest of fleet)
|
||||||
|
- VLAN sub-interfaces on the LAN trunk (one per subnet)
|
||||||
|
- `networking.nftables` for stateful firewall and NAT
|
||||||
|
- IP forwarding enabled (`net.ipv4.ip_forward = 1`)
|
||||||
|
- Masquerade outbound traffic on WAN interface
|
||||||
|
|
||||||
|
**DHCP:**
|
||||||
|
- Kea or dnsmasq for DHCP on client subnets (WLAN, workstations, media)
|
||||||
|
- Infrastructure subnet (10.69.13.0/24) stays static — no DHCP needed
|
||||||
|
- Static leases for known devices
|
||||||
|
|
||||||
|
**Firewall (nftables):**
|
||||||
|
- Default deny between VLANs
|
||||||
|
- Explicit allow rules for known cross-VLAN traffic:
|
||||||
|
- All subnets → ns1/ns2 (DNS)
|
||||||
|
- All subnets → monitoring01 (metrics/logs)
|
||||||
|
- Infrastructure → all (management access)
|
||||||
|
- Workstations → media, core services
|
||||||
|
- NAT masquerade on WAN
|
||||||
|
- Rate limiting on WAN-facing services
|
||||||
|
|
||||||
|
**Traffic Accounting:**
|
||||||
|
- nftables flow accounting or softflowd for NetFlow export
|
||||||
|
- Export to future ntopng instance (see new-services.md)
|
||||||
|
|
||||||
|
### Monitoring Integration
|
||||||
|
|
||||||
|
Since this is a NixOS host in the flake, it gets the standard monitoring stack for free:
|
||||||
|
- node-exporter for system metrics (CPU, memory, NIC throughput per interface)
|
||||||
|
- promtail shipping logs to Loki
|
||||||
|
- Prometheus scrape target auto-registration
|
||||||
|
- Alertmanager alerts for host-down, high CPU, etc.
|
||||||
|
|
||||||
|
Additional router-specific monitoring:
|
||||||
|
- Per-VLAN interface traffic metrics via node-exporter (automatic for all interfaces)
|
||||||
|
- NAT connection tracking table size
|
||||||
|
- WAN uplink status and throughput
|
||||||
|
- DHCP lease metrics (if Kea, it has a Prometheus exporter)
|
||||||
|
|
||||||
|
This is a significant advantage over the EdgeRouter — full observability through
|
||||||
|
the existing Grafana dashboards and Loki log search, debuggable via the monitoring
|
||||||
|
MCP tools.
|
||||||
|
|
||||||
|
### WireGuard Integration
|
||||||
|
|
||||||
|
The remote access plan (remote-access.md) currently proposes a separate `extgw01`
|
||||||
|
gateway host. With a NixOS router, there's a decision to make:
|
||||||
|
|
||||||
|
**Option A:** WireGuard terminates on the router itself. Simplest topology — the
|
||||||
|
router is already the gateway, so VPN traffic doesn't need extra hops or firewall
|
||||||
|
rules. But adds complexity to the router, which should stay simple.
|
||||||
|
|
||||||
|
**Option B:** Keep extgw01 as a separate host (original plan). Router just routes
|
||||||
|
traffic to it. Better separation of concerns, router stays minimal.
|
||||||
|
|
||||||
|
Recommendation: Start with option B (keep it separate). The router should do routing
|
||||||
|
and nothing else. WireGuard can move to the router later if extgw01 feels redundant.
|
||||||
|
|
||||||
|
## Migration Plan
|
||||||
|
|
||||||
|
### Phase 1: Build and lab test
|
||||||
|
- Acquire hardware
|
||||||
|
- Create host config in the flake (routing, NAT, DHCP, firewall)
|
||||||
|
- Test-build on workstation: `nix build .#nixosConfigurations.router01.config.system.build.toplevel`
|
||||||
|
- Lab test with a temporary setup if possible (two NICs, isolated VLAN)
|
||||||
|
|
||||||
|
### Phase 2: Prepare cutover
|
||||||
|
- Pre-configure the MikroTik switch trunk port for the new router
|
||||||
|
- Document current EdgeRouter config (port forwarding, NAT rules, DHCP leases)
|
||||||
|
- Replicate all rules in the NixOS config
|
||||||
|
- Verify DNS, DHCP, and inter-VLAN routing work in test
|
||||||
|
|
||||||
|
### Phase 3: Cutover
|
||||||
|
- Schedule a maintenance window (brief downtime expected)
|
||||||
|
- Swap WAN cable from EdgeRouter to new router
|
||||||
|
- Swap LAN trunk from EdgeRouter to new router
|
||||||
|
- Verify connectivity from each VLAN
|
||||||
|
- Verify internet access, DNS resolution, inter-VLAN routing
|
||||||
|
- Monitor via Prometheus/Loki (immediately available since it's a fleet host)
|
||||||
|
|
||||||
|
### Phase 4: Decommission EdgeRouter
|
||||||
|
- Keep EdgeRouter available as fallback for a few weeks
|
||||||
|
- Remove `gw` entry from external-hosts.nix, replace with flake-managed host
|
||||||
|
- Update any references to 10.69.10.1 if the router IP changes
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- **Router IP:** Keep 10.69.10.1 or move to a different address? Each VLAN
|
||||||
|
sub-interface needs an IP (the gateway address for that subnet).
|
||||||
|
- **ISP uplink:** What type of WAN connection? PPPoE, DHCP, static IP?
|
||||||
|
- **Port forwarding:** What ports are currently forwarded on the EdgeRouter?
|
||||||
|
These need to be replicated in nftables.
|
||||||
|
- **DHCP scope:** Which subnets currently get DHCP from the EdgeRouter vs
|
||||||
|
other sources (UniFi controller for WLAN?)?
|
||||||
|
- **UPnP/NAT-PMP:** Needed for any devices? (gaming consoles, etc.)
|
||||||
|
- **Hardware preference:** Fanless mini PC budget and preferred vendor?
|
||||||
@@ -4,119 +4,118 @@
|
|||||||
|
|
||||||
## Goal
|
## Goal
|
||||||
|
|
||||||
Enable remote access to some or all homelab services from outside the internal network, without exposing anything directly to the internet.
|
Enable personal remote access to selected homelab services from outside the internal network, without exposing anything directly to the internet.
|
||||||
|
|
||||||
## Current State
|
## Current State
|
||||||
|
|
||||||
- All services are only accessible from the internal 10.69.13.x network
|
- All services are only accessible from the internal 10.69.13.x network
|
||||||
- Exception: jelly01 has a WireGuard link to an external VPS
|
- http-proxy has a WireGuard tunnel (`wg0`, `10.69.222.0/24`) to a VPS (`docker2.t-juice.club`) on an OpenStack cluster
|
||||||
- No services are directly exposed to the public internet
|
- VPS runs Traefik which proxies selected services (including Jellyfin) back through the tunnel to http-proxy's Caddy
|
||||||
|
- No other services are directly exposed to the public internet
|
||||||
|
|
||||||
## Constraints
|
## Decision: WireGuard Gateway
|
||||||
|
|
||||||
- Nothing should be directly accessible from the outside
|
After evaluating WireGuard gateway vs Headscale (self-hosted Tailscale), the **WireGuard gateway** approach was chosen:
|
||||||
- Must use VPN or overlay network (no port forwarding of services)
|
|
||||||
- Self-hosted solutions preferred over managed services
|
|
||||||
|
|
||||||
## Options
|
- Only 2 client devices (laptop + phone), so Headscale's device management UX isn't needed
|
||||||
|
- Split DNS works fine on Linux laptop via systemd-resolved; all-or-nothing DNS on phone is acceptable for occasional use
|
||||||
|
- Simpler infrastructure - no control server to maintain
|
||||||
|
- Builds on existing WireGuard experience and setup
|
||||||
|
|
||||||
### 1. WireGuard Gateway (Internal Router)
|
## Architecture
|
||||||
|
|
||||||
A dedicated NixOS host on the internal network with a WireGuard tunnel out to the VPS. The VPS becomes the public entry point, and the gateway routes traffic to internal services. Firewall rules on the gateway control which services are reachable.
|
```mermaid
|
||||||
|
graph TD
|
||||||
|
clients["Laptop / Phone"]
|
||||||
|
vps["VPS<br/>(WireGuard endpoint)"]
|
||||||
|
extgw["extgw01<br/>(gateway + bastion)"]
|
||||||
|
grafana["Grafana<br/>monitoring01:3000"]
|
||||||
|
jellyfin["Jellyfin<br/>jelly01:8096"]
|
||||||
|
arr["arr stack<br/>*-jail hosts"]
|
||||||
|
|
||||||
**Pros:**
|
clients -->|WireGuard| vps
|
||||||
- Simple, well-understood technology
|
vps -->|WireGuard tunnel| extgw
|
||||||
- Already running WireGuard for jelly01
|
extgw -->|allowed traffic| grafana
|
||||||
- Full control over routing and firewall rules
|
extgw -->|allowed traffic| jellyfin
|
||||||
- Excellent NixOS module support
|
extgw -->|allowed traffic| arr
|
||||||
- No extra dependencies
|
```
|
||||||
|
|
||||||
**Cons:**
|
### Existing path (unchanged)
|
||||||
- Hub-and-spoke topology (all traffic goes through VPS)
|
|
||||||
- Manual peer management
|
|
||||||
- Adding a new client device means editing configs on both VPS and gateway
|
|
||||||
|
|
||||||
### 2. WireGuard Mesh (No Relay)
|
The current public access path stays as-is:
|
||||||
|
|
||||||
Each client device connects directly to a WireGuard endpoint. Could be on the VPS which forwards to the homelab, or if there is a routable IP at home, directly to an internal host.
|
```
|
||||||
|
Internet → VPS (Traefik) → WireGuard → http-proxy (Caddy) → internal services
|
||||||
|
```
|
||||||
|
|
||||||
**Pros:**
|
This handles public Jellyfin access and any other publicly-exposed services.
|
||||||
- Simple and fast
|
|
||||||
- No extra software
|
|
||||||
|
|
||||||
**Cons:**
|
### New path (personal VPN)
|
||||||
- Manual key and endpoint management for every peer
|
|
||||||
- Doesn't scale well
|
|
||||||
- If behind CGNAT, still needs the VPS as intermediary
|
|
||||||
|
|
||||||
### 3. Headscale (Self-Hosted Tailscale)
|
A separate WireGuard tunnel for personal remote access with restricted firewall rules:
|
||||||
|
|
||||||
Run a Headscale control server (on the VPS or internally) and install the Tailscale client on homelab hosts and personal devices. Gets the Tailscale mesh networking UX without depending on Tailscale's infrastructure.
|
```
|
||||||
|
Laptop/Phone → VPS (WireGuard peers) → tunnel → extgw01 (firewall) → allowed services
|
||||||
|
```
|
||||||
|
|
||||||
**Pros:**
|
### Access tiers
|
||||||
- Mesh topology - devices communicate directly via NAT traversal (DERP relay as fallback)
|
|
||||||
- Easy to add/remove devices
|
|
||||||
- ACL support for granular access control
|
|
||||||
- MagicDNS for service discovery
|
|
||||||
- Good NixOS support for both headscale server and tailscale client
|
|
||||||
- Subnet routing lets you expose the entire 10.69.13.x network or specific hosts without installing tailscale on every host
|
|
||||||
|
|
||||||
**Cons:**
|
1. **VPN (default)**: Laptop/phone connect to VPS WireGuard endpoint, traffic routed through extgw01 firewall. Only whitelisted services are reachable.
|
||||||
- More moving parts than plain WireGuard
|
2. **SSH + 2FA (escalated)**: SSH into extgw01 for full network access when needed.
|
||||||
- Headscale is a third-party reimplementation, can lag behind Tailscale features
|
|
||||||
- Need to run and maintain the control server
|
|
||||||
|
|
||||||
### 4. Tailscale (Managed)
|
## New Host: extgw01
|
||||||
|
|
||||||
Same as Headscale but using Tailscale's hosted control plane.
|
A NixOS host on the internal network acting as both WireGuard gateway and SSH bastion.
|
||||||
|
|
||||||
**Pros:**
|
### Responsibilities
|
||||||
- Zero infrastructure to manage on the control plane side
|
|
||||||
- Polished UX, well-maintained clients
|
|
||||||
- Free tier covers personal use
|
|
||||||
|
|
||||||
**Cons:**
|
- **WireGuard tunnel** to the VPS for client traffic
|
||||||
- Dependency on Tailscale's service
|
- **Firewall** with allowlist controlling which internal services are reachable through the VPN
|
||||||
- Less aligned with self-hosting preference
|
- **SSH bastion** with 2FA for full network access when needed
|
||||||
- Coordination metadata goes through their servers (data plane is still peer-to-peer)
|
- **DNS**: Clients get split DNS config (laptop via systemd-resolved routing domain, phone uses internal DNS for all queries)
|
||||||
|
|
||||||
### 5. Netbird (Self-Hosted)
|
### Firewall allowlist (initial)
|
||||||
|
|
||||||
Open-source alternative to Tailscale with a self-hostable management server. WireGuard-based, supports ACLs and NAT traversal.
|
| Service | Destination | Port |
|
||||||
|
|------------|------------------------------|-------|
|
||||||
|
| Grafana | monitoring01.home.2rjus.net | 3000 |
|
||||||
|
| Jellyfin | jelly01.home.2rjus.net | 8096 |
|
||||||
|
| Sonarr | sonarr-jail.home.2rjus.net | 8989 |
|
||||||
|
| Radarr | radarr-jail.home.2rjus.net | 7878 |
|
||||||
|
| NZBget | nzbget-jail.home.2rjus.net | 6789 |
|
||||||
|
|
||||||
**Pros:**
|
### SSH 2FA options (to be decided)
|
||||||
- Fully self-hostable
|
|
||||||
- Web UI for management
|
|
||||||
- ACL and peer grouping support
|
|
||||||
|
|
||||||
**Cons:**
|
- **Kanidm**: Already deployed on kanidm01, supports RADIUS/OAuth2 for PAM integration
|
||||||
- Heavier to self-host (needs multiple components: management server, signal server, TURN relay)
|
- **SSH certificates via OpenBao**: Fits existing Vault infrastructure, short-lived certs
|
||||||
- Less mature NixOS module support compared to Tailscale/Headscale
|
- **TOTP via PAM**: Simplest fallback, Google Authenticator / similar
|
||||||
|
|
||||||
### 6. Nebula (by Defined Networking)
|
## VPS Configuration
|
||||||
|
|
||||||
Certificate-based mesh VPN. Each node gets a certificate from a CA you control. No central coordination server needed at runtime.
|
The VPS needs a new WireGuard interface (separate from the existing http-proxy tunnel):
|
||||||
|
|
||||||
**Pros:**
|
- WireGuard endpoint listening on a public UDP port
|
||||||
- No always-on control plane
|
- 2 peers: laptop, phone
|
||||||
- Certificate-based identity
|
- Routes client traffic through tunnel to extgw01
|
||||||
- Lightweight
|
- Minimal config - just routing, no firewall policy (that lives on extgw01)
|
||||||
|
|
||||||
**Cons:**
|
## Implementation Steps
|
||||||
- Less convenient for ad-hoc device addition (need to issue certs)
|
|
||||||
- NAT traversal less mature than Tailscale's
|
|
||||||
- Smaller community/ecosystem
|
|
||||||
|
|
||||||
## Key Decision Points
|
1. **Create extgw01 host configuration** in this repo
|
||||||
|
- VM provisioned via OpenTofu (same as other hosts)
|
||||||
- **Static public IP vs CGNAT?** Determines whether clients can connect directly to home network or need VPS relay.
|
- WireGuard interface for VPS tunnel
|
||||||
- **Number of client devices?** If just phone and laptop, plain WireGuard via VPS is fine. More devices favors Headscale.
|
- nftables/iptables firewall with service allowlist
|
||||||
- **Per-service vs per-network access?** Gateway with firewall rules gives per-service control. Headscale ACLs can also do this. Plain WireGuard gives network-level access with gateway firewall for finer control.
|
- IP forwarding enabled
|
||||||
- **Subnet routing vs per-host agents?** With Headscale/Tailscale, can either install client on every host, or use a single subnet router that advertises the 10.69.13.x range. The latter is closer to the gateway approach and avoids touching every host.
|
2. **Configure VPS WireGuard** for client peers
|
||||||
|
- New WireGuard interface with laptop + phone peers
|
||||||
## Leading Candidates
|
- Routing for 10.69.13.0/24 through extgw01 tunnel
|
||||||
|
3. **Set up client configs**
|
||||||
Based on existing WireGuard experience, self-hosting preference, and NixOS stack:
|
- Laptop: WireGuard config + systemd-resolved split DNS for `home.2rjus.net`
|
||||||
|
- Phone: WireGuard app config with DNS pointing at internal nameservers
|
||||||
1. **Headscale with a subnet router** - Best balance of convenience and self-hosting
|
4. **Set up SSH 2FA** on extgw01
|
||||||
2. **WireGuard gateway via VPS** - Simplest, most transparent, builds on existing setup
|
- Evaluate Kanidm integration vs OpenBao SSH certs vs TOTP
|
||||||
|
5. **Test and verify**
|
||||||
|
- VPN access to allowed services only
|
||||||
|
- Firewall blocks everything else
|
||||||
|
- SSH + 2FA grants full access
|
||||||
|
- Existing public access path unaffected
|
||||||
|
|||||||
@@ -39,23 +39,17 @@ Expand storage capacity for the main hdd-pool. Since we need to add disks anyway
|
|||||||
- nzbget: NixOS service or OCI container
|
- nzbget: NixOS service or OCI container
|
||||||
- NFS exports: `services.nfs.server`
|
- NFS exports: `services.nfs.server`
|
||||||
|
|
||||||
### Filesystem: BTRFS RAID1
|
### Filesystem: Keep ZFS
|
||||||
|
|
||||||
**Decision**: Migrate from ZFS to BTRFS with RAID1
|
**Decision**: Keep existing ZFS pool, import on NixOS
|
||||||
|
|
||||||
**Rationale**:
|
**Rationale**:
|
||||||
- **In-kernel**: No out-of-tree module issues like ZFS
|
- **No data migration needed**: Existing ZFS pool can be imported directly on NixOS
|
||||||
- **Flexible expansion**: Add individual disks, not required to buy pairs
|
- **Proven reliability**: Pool has been running reliably on TrueNAS
|
||||||
- **Mixed disk sizes**: Better handling than ZFS multi-vdev approach
|
- **NixOS ZFS support**: Well-supported, declarative configuration via `boot.zfs` and `services.zfs`
|
||||||
- **RAID level conversion**: Can convert between RAID levels in place
|
- **BTRFS RAID5/6 unreliable**: Research showed BTRFS RAID5/6 write hole is still unresolved
|
||||||
- Built-in checksumming, snapshots, compression (zstd)
|
- **BTRFS RAID1 wasteful**: With mixed disk sizes, RAID1 wastes significant capacity vs ZFS mirrors
|
||||||
- NixOS has good BTRFS support
|
- Checksumming, snapshots, compression (lz4/zstd) all available
|
||||||
|
|
||||||
**BTRFS RAID1 notes**:
|
|
||||||
- "RAID1" means 2 copies of all data
|
|
||||||
- Distributes across all available devices
|
|
||||||
- With 6+ disks, provides redundancy + capacity scaling
|
|
||||||
- RAID5/6 avoided (known issues), RAID1/10 are stable
|
|
||||||
|
|
||||||
### Hardware: Keep Existing + Add Disks
|
### Hardware: Keep Existing + Add Disks
|
||||||
|
|
||||||
@@ -69,83 +63,94 @@ Expand storage capacity for the main hdd-pool. Since we need to add disks anyway
|
|||||||
|
|
||||||
**Storage architecture**:
|
**Storage architecture**:
|
||||||
|
|
||||||
**Bulk storage** (BTRFS RAID1 on HDDs):
|
**hdd-pool** (ZFS mirrors):
|
||||||
- Current: 6x HDDs (2x16TB + 2x8TB + 2x8TB)
|
- Current: 3 mirror vdevs (2x16TB + 2x8TB + 2x8TB) = 32TB usable
|
||||||
- Add: 2x new HDDs (size TBD)
|
- Add: mirror-3 with 2x 24TB = +24TB usable
|
||||||
|
- Total after expansion: ~56TB usable
|
||||||
- Use: Media, downloads, backups, non-critical data
|
- Use: Media, downloads, backups, non-critical data
|
||||||
- Risk tolerance: High (data mostly replaceable)
|
|
||||||
|
|
||||||
**Critical data** (small volume):
|
|
||||||
- Use 2x 240GB SSDs in mirror (BTRFS or ZFS)
|
|
||||||
- Or use 2TB NVMe for critical data
|
|
||||||
- Risk tolerance: Low (data important but small)
|
|
||||||
|
|
||||||
### Disk Purchase Decision
|
### Disk Purchase Decision
|
||||||
|
|
||||||
**Options under consideration**:
|
**Decision**: 2x 24TB drives (ordered, arriving 2026-02-21)
|
||||||
|
|
||||||
**Option A: 2x 16TB drives**
|
|
||||||
- Matches largest current drives
|
|
||||||
- Enables potential future RAID5 if desired (6x 16TB array)
|
|
||||||
- More conservative capacity increase
|
|
||||||
|
|
||||||
**Option B: 2x 20-24TB drives**
|
|
||||||
- Larger capacity headroom
|
|
||||||
- Better $/TB ratio typically
|
|
||||||
- Future-proofs better
|
|
||||||
|
|
||||||
**Initial purchase**: 2 drives (chassis has space for 2 more without modifications)
|
|
||||||
|
|
||||||
## Migration Strategy
|
## Migration Strategy
|
||||||
|
|
||||||
### High-Level Plan
|
### High-Level Plan
|
||||||
|
|
||||||
1. **Preparation**:
|
1. **Expand ZFS pool** (on TrueNAS):
|
||||||
- Purchase 2x new HDDs (16TB or 20-24TB)
|
- Install 2x 24TB drives (may need new drive trays - order from abroad if needed)
|
||||||
- Create NixOS configuration for new storage host
|
- If chassis space is limited, temporarily replace the two oldest 8TB drives (da0/ada4)
|
||||||
- Set up bare metal NixOS installation
|
- Add as mirror-3 vdev to hdd-pool
|
||||||
|
- Verify pool health and resilver completes
|
||||||
|
- Check SMART data on old 8TB drives (all healthy as of 2026-02-20, no reallocated sectors)
|
||||||
|
- Burn-in: at minimum short + long SMART test before adding to pool
|
||||||
|
|
||||||
2. **Initial BTRFS pool**:
|
2. **Prepare NixOS configuration**:
|
||||||
- Install 2 new disks
|
- Create host configuration (`hosts/nas1/` or similar)
|
||||||
- Create BTRFS filesystem in RAID1
|
- Configure ZFS pool import (`boot.zfs.extraPools`)
|
||||||
- Mount and test NFS exports
|
- Set up services: radarr, sonarr, nzbget, restic-rest, NFS
|
||||||
|
- Configure monitoring (node-exporter, promtail, smartctl-exporter)
|
||||||
|
|
||||||
3. **Data migration**:
|
3. **Install NixOS**:
|
||||||
- Copy data from TrueNAS ZFS pool to new BTRFS pool over 10GbE
|
- `zfs export hdd-pool` on TrueNAS before shutdown (clean export)
|
||||||
- Verify data integrity
|
- Wipe TrueNAS boot-pool SSDs, set up as mdadm RAID1 for NixOS root
|
||||||
|
- Install NixOS on mdadm mirror (keeps boot path ZFS-independent)
|
||||||
|
- Import hdd-pool via `boot.zfs.extraPools`
|
||||||
|
- Verify all datasets mount correctly
|
||||||
|
|
||||||
4. **Expand pool**:
|
4. **Service migration**:
|
||||||
- As old ZFS pool is emptied, wipe drives and add to BTRFS pool
|
- Configure NixOS services to use ZFS dataset paths
|
||||||
- Pool grows incrementally: 2 → 4 → 6 → 8 disks
|
- Update NFS exports
|
||||||
- BTRFS rebalances data across new devices
|
- Test from consuming hosts
|
||||||
|
|
||||||
5. **Service migration**:
|
5. **Cutover**:
|
||||||
- Set up radarr/sonarr/nzbget/restic as NixOS services
|
- Update DNS/client mounts if IP changes
|
||||||
- Update NFS client mounts on consuming hosts
|
- Verify monitoring integration
|
||||||
|
|
||||||
6. **Cutover**:
|
|
||||||
- Point consumers to new NAS host
|
|
||||||
- Decommission TrueNAS
|
- Decommission TrueNAS
|
||||||
- Repurpose hardware or keep as spare
|
|
||||||
|
### Post-Expansion: Vdev Rebalancing
|
||||||
|
|
||||||
|
ZFS has no built-in rebalance command. After adding the new 24TB vdev, ZFS will
|
||||||
|
write new data preferentially to it (most free space), leaving old vdevs packed
|
||||||
|
at ~97%. This is suboptimal but not urgent once overall pool usage drops to ~50%.
|
||||||
|
|
||||||
|
To gradually rebalance, rewrite files in place so ZFS redistributes blocks across
|
||||||
|
all vdevs proportional to free space:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Rewrite files individually (spreads blocks across all vdevs)
|
||||||
|
find /pool/dataset -type f -exec sh -c '
|
||||||
|
for f; do cp "$f" "$f.rebal" && mv "$f.rebal" "$f"; done
|
||||||
|
' _ {} +
|
||||||
|
```
|
||||||
|
|
||||||
|
Avoid `zfs send/recv` for large datasets (e.g. 20TB) as this would concentrate
|
||||||
|
data on the emptiest vdev rather than spreading it evenly.
|
||||||
|
|
||||||
|
**Recommendation**: Do this after NixOS migration is stable. Not urgent - the pool
|
||||||
|
will function fine with uneven distribution, just slightly suboptimal for performance.
|
||||||
|
|
||||||
### Migration Advantages
|
### Migration Advantages
|
||||||
|
|
||||||
- **Low risk**: New pool created independently, old data remains intact during migration
|
- **No data migration**: ZFS pool imported directly, no copying terabytes of data
|
||||||
- **Incremental**: Can add old disks one at a time as space allows
|
- **Low risk**: Pool expansion done on stable TrueNAS before OS swap
|
||||||
- **Flexible**: BTRFS handles mixed disk sizes gracefully
|
- **Reversible**: Can boot back to TrueNAS if NixOS has issues (ZFS pool is OS-independent)
|
||||||
- **Reversible**: Keep TrueNAS running until fully validated
|
- **Quick cutover**: Once NixOS config is ready, the OS swap is fast
|
||||||
|
|
||||||
## Next Steps
|
## Next Steps
|
||||||
|
|
||||||
1. Decide on disk size (16TB vs 20-24TB)
|
1. ~~Decide on disk size~~ - 2x 24TB ordered
|
||||||
2. Purchase disks
|
2. Install drives and add mirror vdev to ZFS pool
|
||||||
3. Design NixOS host configuration (`hosts/nas1/`)
|
3. Check SMART data on 8TB drives - decide whether to keep or retire
|
||||||
4. Plan detailed migration timeline
|
4. Design NixOS host configuration (`hosts/nas1/`)
|
||||||
5. Document NFS export mapping (current → new)
|
5. Document NFS export mapping (current -> new)
|
||||||
|
6. Plan NixOS installation and cutover
|
||||||
|
|
||||||
## Open Questions
|
## Open Questions
|
||||||
|
|
||||||
- [ ] Final decision on disk size?
|
|
||||||
- [ ] Hostname for new NAS host? (nas1? storage1?)
|
- [ ] Hostname for new NAS host? (nas1? storage1?)
|
||||||
- [ ] IP address allocation (keep 10.69.12.50 or new IP?)
|
- [ ] IP address/subnet: NAS and Proxmox are both on 10GbE to the same switch but different subnets, forcing traffic through the router (bottleneck). Move to same subnet during migration.
|
||||||
- [ ] Timeline/maintenance window for migration?
|
- [x] Boot drive: Reuse TrueNAS boot-pool SSDs as mdadm RAID1 for NixOS root (no ZFS on boot path)
|
||||||
|
- [ ] Retire old 8TB drives? (SMART looks healthy, keep unless chassis space is needed)
|
||||||
|
- [ ] Drive trays: do new 24TB drives fit, or order trays from abroad?
|
||||||
|
- [ ] Timeline/maintenance window for NixOS swap?
|
||||||
|
|||||||
311
docs/user-management.md
Normal file
311
docs/user-management.md
Normal file
@@ -0,0 +1,311 @@
|
|||||||
|
# User Management with Kanidm
|
||||||
|
|
||||||
|
Central authentication for the homelab using Kanidm.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
- **Server**: kanidm01.home.2rjus.net (auth.home.2rjus.net)
|
||||||
|
- **WebUI**: https://auth.home.2rjus.net
|
||||||
|
- **LDAPS**: port 636
|
||||||
|
|
||||||
|
## CLI Setup
|
||||||
|
|
||||||
|
The `kanidm` CLI is available in the devshell:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop
|
||||||
|
|
||||||
|
# Login as idm_admin
|
||||||
|
kanidm login --name idm_admin --url https://auth.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## User Management
|
||||||
|
|
||||||
|
POSIX users are managed imperatively via the `kanidm` CLI. This allows setting
|
||||||
|
all attributes (including UNIX password) in one workflow.
|
||||||
|
|
||||||
|
### Creating a POSIX User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create the person
|
||||||
|
kanidm person create <username> "<Display Name>"
|
||||||
|
|
||||||
|
# Add to groups
|
||||||
|
kanidm group add-members ssh-users <username>
|
||||||
|
|
||||||
|
# Enable POSIX (UID is auto-assigned)
|
||||||
|
kanidm person posix set <username>
|
||||||
|
|
||||||
|
# Set UNIX password (required for SSH login, min 10 characters)
|
||||||
|
kanidm person posix set-password <username>
|
||||||
|
|
||||||
|
# Optionally set login shell
|
||||||
|
kanidm person posix set <username> --shell /bin/zsh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Setting Email Address
|
||||||
|
|
||||||
|
Email is required for OAuth2/OIDC login (e.g., Grafana):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person update <username> --mail <email>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Full User Creation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person create testuser "Test User"
|
||||||
|
kanidm person update testuser --mail testuser@home.2rjus.net
|
||||||
|
kanidm group add-members ssh-users testuser
|
||||||
|
kanidm group add-members users testuser # Required for OAuth2 scopes
|
||||||
|
kanidm person posix set testuser
|
||||||
|
kanidm person posix set-password testuser
|
||||||
|
kanidm person get testuser
|
||||||
|
```
|
||||||
|
|
||||||
|
After creation, verify on a client host:
|
||||||
|
```bash
|
||||||
|
getent passwd testuser
|
||||||
|
ssh testuser@testvm01.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
### Viewing User Details
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Removing a User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person delete <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Group Management
|
||||||
|
|
||||||
|
Groups for POSIX access are also managed via CLI.
|
||||||
|
|
||||||
|
### Creating a POSIX Group
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create the group
|
||||||
|
kanidm group create <group-name>
|
||||||
|
|
||||||
|
# Enable POSIX with a specific GID
|
||||||
|
kanidm group posix set <group-name> --gidnumber <gid>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adding Members
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group add-members <group-name> <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Viewing Group Details
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group get <group-name>
|
||||||
|
kanidm group list-members <group-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Full Group Creation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group create testgroup
|
||||||
|
kanidm group posix set testgroup --gidnumber 68010
|
||||||
|
kanidm group add-members testgroup testuser
|
||||||
|
kanidm group get testgroup
|
||||||
|
```
|
||||||
|
|
||||||
|
After creation, verify on a client host:
|
||||||
|
```bash
|
||||||
|
getent group testgroup
|
||||||
|
```
|
||||||
|
|
||||||
|
### Current Groups
|
||||||
|
|
||||||
|
| Group | GID | Purpose |
|
||||||
|
|-------|-----|---------|
|
||||||
|
| ssh-users | 68000 | SSH login access |
|
||||||
|
| admins | 68001 | Administrative access |
|
||||||
|
| users | 68002 | General users |
|
||||||
|
|
||||||
|
### UID/GID Allocation
|
||||||
|
|
||||||
|
Kanidm auto-assigns UIDs/GIDs from its configured range. For manually assigned GIDs:
|
||||||
|
|
||||||
|
| Range | Purpose |
|
||||||
|
|-------|---------|
|
||||||
|
| 65,536+ | Users (auto-assigned) |
|
||||||
|
| 68,000 - 68,999 | Groups (manually assigned) |
|
||||||
|
|
||||||
|
## OAuth2/OIDC Login (Web Services)
|
||||||
|
|
||||||
|
For OAuth2/OIDC login to web services like Grafana, users need:
|
||||||
|
|
||||||
|
1. **Primary credential** - Password set via `credential update` (separate from unix password)
|
||||||
|
2. **MFA** - TOTP or passkey (Kanidm requires MFA for primary credentials)
|
||||||
|
3. **Group membership** - Member of `users` group (for OAuth2 scope mapping)
|
||||||
|
4. **Email address** - Set via `person update --mail`
|
||||||
|
|
||||||
|
### Setting Up Primary Credential (Web Login)
|
||||||
|
|
||||||
|
The primary credential is different from the unix/POSIX password:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Interactive credential setup
|
||||||
|
kanidm person credential update <username>
|
||||||
|
|
||||||
|
# In the interactive prompt:
|
||||||
|
# 1. Type 'password' to set a password
|
||||||
|
# 2. Type 'totp' to add TOTP (scan QR with authenticator app)
|
||||||
|
# 3. Type 'commit' to save
|
||||||
|
```
|
||||||
|
|
||||||
|
### Verifying OAuth2 Readiness
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
Check for:
|
||||||
|
- `mail:` - Email address set
|
||||||
|
- `memberof:` - Includes `users@home.2rjus.net`
|
||||||
|
- Primary credential status (check via `credential update` → `status`)
|
||||||
|
|
||||||
|
## PAM/NSS Client Configuration
|
||||||
|
|
||||||
|
Enable central authentication on a host:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
```
|
||||||
|
|
||||||
|
This configures:
|
||||||
|
- `services.kanidm.enablePam = true`
|
||||||
|
- Client connection to auth.home.2rjus.net
|
||||||
|
- Login authorization for `ssh-users` group
|
||||||
|
- Short usernames (`torjus` instead of `torjus@home.2rjus.net`)
|
||||||
|
- Home directory symlinks (`/home/torjus` → UUID-based directory)
|
||||||
|
|
||||||
|
### Enabled Hosts
|
||||||
|
|
||||||
|
- testvm01, testvm02, testvm03 (test tier)
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.kanidm = {
|
||||||
|
enable = true;
|
||||||
|
server = "https://auth.home.2rjus.net"; # default
|
||||||
|
allowedLoginGroups = [ "ssh-users" ]; # default
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Home Directories
|
||||||
|
|
||||||
|
Home directories use UUID-based paths for stability (so renaming a user doesn't
|
||||||
|
require moving their home directory). Symlinks provide convenient access:
|
||||||
|
|
||||||
|
```
|
||||||
|
/home/torjus -> /home/e4f4c56c-4aee-4c20-846f-90cb69807733
|
||||||
|
```
|
||||||
|
|
||||||
|
The symlinks are created by `kanidm-unixd-tasks` on first login.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Verify NSS Resolution
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check user resolution
|
||||||
|
getent passwd <username>
|
||||||
|
|
||||||
|
# Check group resolution
|
||||||
|
getent group <group-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test SSH Login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh <username>@<hostname>.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### "PAM user mismatch" error
|
||||||
|
|
||||||
|
SSH fails with "fatal: PAM user mismatch" in logs. This happens when Kanidm returns
|
||||||
|
usernames in SPN format (`torjus@home.2rjus.net`) but SSH expects short names (`torjus`).
|
||||||
|
|
||||||
|
**Solution**: Configure `uid_attr_map = "name"` in unixSettings (already set in our module).
|
||||||
|
|
||||||
|
Check current format:
|
||||||
|
```bash
|
||||||
|
getent passwd torjus
|
||||||
|
# Should show: torjus:x:65536:...
|
||||||
|
# NOT: torjus@home.2rjus.net:x:65536:...
|
||||||
|
```
|
||||||
|
|
||||||
|
### User resolves but SSH fails immediately
|
||||||
|
|
||||||
|
The user's login group (e.g., `ssh-users`) likely doesn't have POSIX enabled:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check if group has POSIX
|
||||||
|
getent group ssh-users
|
||||||
|
|
||||||
|
# If empty, enable POSIX on the server
|
||||||
|
kanidm group posix set ssh-users --gidnumber 68000
|
||||||
|
```
|
||||||
|
|
||||||
|
### User doesn't resolve via getent
|
||||||
|
|
||||||
|
1. Check kanidm-unixd service is running:
|
||||||
|
```bash
|
||||||
|
systemctl status kanidm-unixd
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check unixd can reach server:
|
||||||
|
```bash
|
||||||
|
kanidm-unix status
|
||||||
|
# Should show: system: online, Kanidm: online
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Check client can reach server:
|
||||||
|
```bash
|
||||||
|
curl -s https://auth.home.2rjus.net/status
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Check user has POSIX enabled on server:
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Restart nscd to clear stale cache:
|
||||||
|
```bash
|
||||||
|
systemctl restart nscd
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Invalidate kanidm cache:
|
||||||
|
```bash
|
||||||
|
kanidm-unix cache-invalidate
|
||||||
|
```
|
||||||
|
|
||||||
|
### Changes not taking effect after deployment
|
||||||
|
|
||||||
|
NixOS uses nsncd (a Rust reimplementation of nscd) for NSS caching. After deploying
|
||||||
|
kanidm-unixd config changes, you may need to restart both services:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
systemctl restart kanidm-unixd
|
||||||
|
systemctl restart nscd
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test PAM authentication directly
|
||||||
|
|
||||||
|
Use the kanidm-unix CLI to test PAM auth without SSH:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm-unix auth-test --name <username>
|
||||||
|
```
|
||||||
28
flake.lock
generated
28
flake.lock
generated
@@ -28,11 +28,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1770481834,
|
"lastModified": 1771488195,
|
||||||
"narHash": "sha256-Xx9BYnI0C/qgPbwr9nj6NoAdQTbYLunrdbNSaUww9oY=",
|
"narHash": "sha256-2kMxqdDyPluRQRoES22Y0oSjp7pc5fj2nRterfmSIyc=",
|
||||||
"ref": "master",
|
"ref": "master",
|
||||||
"rev": "fd0d63b103dfaf21d1c27363266590e723021c67",
|
"rev": "2d26de50559d8acb82ea803764e138325d95572c",
|
||||||
"revCount": 24,
|
"revCount": 37,
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/homelab-deploy"
|
"url": "https://git.t-juice.club/torjus/homelab-deploy"
|
||||||
},
|
},
|
||||||
@@ -49,11 +49,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1770422522,
|
"lastModified": 1770593543,
|
||||||
"narHash": "sha256-WmIFnquu4u58v8S2bOVWmknRwHn4x88CRfBFTzJ1inQ=",
|
"narHash": "sha256-hT8Rj6JAwGDFvcxWEcUzTCrWSiupCfBa57pBDnM2C5g=",
|
||||||
"ref": "refs/heads/master",
|
"ref": "refs/heads/master",
|
||||||
"rev": "cf0ce858997af4d8dcc2ce10393ff393e17fc911",
|
"rev": "5aa5f7275b7a08015816171ba06d2cbdc2e02d3e",
|
||||||
"revCount": 11,
|
"revCount": 15,
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/nixos-exporter"
|
"url": "https://git.t-juice.club/torjus/nixos-exporter"
|
||||||
},
|
},
|
||||||
@@ -64,11 +64,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1770136044,
|
"lastModified": 1771419570,
|
||||||
"narHash": "sha256-tlFqNG/uzz2++aAmn4v8J0vAkV3z7XngeIIB3rM3650=",
|
"narHash": "sha256-bxAlQgre3pcQcaRUm/8A0v/X8d2nhfraWSFqVmMcBcU=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "e576e3c9cf9bad747afcddd9e34f51d18c855b4e",
|
"rev": "6d41bc27aaf7b6a3ba6b169db3bd5d6159cfaa47",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -80,11 +80,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs-unstable": {
|
"nixpkgs-unstable": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1770197578,
|
"lastModified": 1771369470,
|
||||||
"narHash": "sha256-AYqlWrX09+HvGs8zM6ebZ1pwUqjkfpnv8mewYwAo+iM=",
|
"narHash": "sha256-0NBlEBKkN3lufyvFegY4TYv5mCNHbi5OmBDrzihbBMQ=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "00c21e4c93d963c50d4c0c89bfa84ed6e0694df2",
|
"rev": "0182a361324364ae3f436a63005877674cf45efb",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|||||||
48
flake.nix
48
flake.nix
@@ -92,15 +92,6 @@
|
|||||||
./hosts/http-proxy
|
./hosts/http-proxy
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
monitoring01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self;
|
|
||||||
};
|
|
||||||
modules = commonModules ++ [
|
|
||||||
./hosts/monitoring01
|
|
||||||
];
|
|
||||||
};
|
|
||||||
jelly01 = nixpkgs.lib.nixosSystem {
|
jelly01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
@@ -110,15 +101,6 @@
|
|||||||
./hosts/jelly01
|
./hosts/jelly01
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
nix-cache01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self;
|
|
||||||
};
|
|
||||||
modules = commonModules ++ [
|
|
||||||
./hosts/nix-cache01
|
|
||||||
];
|
|
||||||
};
|
|
||||||
nats1 = nixpkgs.lib.nixosSystem {
|
nats1 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
@@ -191,6 +173,33 @@
|
|||||||
./hosts/kanidm01
|
./hosts/kanidm01
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
monitoring02 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/monitoring02
|
||||||
|
];
|
||||||
|
};
|
||||||
|
nix-cache02 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/nix-cache02
|
||||||
|
];
|
||||||
|
};
|
||||||
|
garage01 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/garage01
|
||||||
|
];
|
||||||
|
};
|
||||||
};
|
};
|
||||||
packages = forAllSystems (
|
packages = forAllSystems (
|
||||||
{ pkgs }:
|
{ pkgs }:
|
||||||
@@ -207,9 +216,12 @@
|
|||||||
pkgs.ansible
|
pkgs.ansible
|
||||||
pkgs.opentofu
|
pkgs.opentofu
|
||||||
pkgs.openbao
|
pkgs.openbao
|
||||||
|
pkgs.kanidm_1_8
|
||||||
|
pkgs.nkeys
|
||||||
(pkgs.callPackage ./scripts/create-host { })
|
(pkgs.callPackage ./scripts/create-host { })
|
||||||
homelab-deploy.packages.${pkgs.system}.default
|
homelab-deploy.packages.${pkgs.system}.default
|
||||||
];
|
];
|
||||||
|
ANSIBLE_CONFIG = "./ansible/ansible.cfg";
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,33 +1,37 @@
|
|||||||
{
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
pkgs,
|
pkgs,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
./hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
homelab.dns.cnames = [ "nix-cache" "actions1" ];
|
# Host metadata (adjust as needed)
|
||||||
|
homelab.host = {
|
||||||
homelab.host.role = "build-host";
|
tier = "test"; # Start in test tier, move to prod after validation
|
||||||
|
role = "storage";
|
||||||
fileSystems."/nix" = {
|
|
||||||
device = "/dev/disk/by-label/nixcache";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
homelab.dns.cnames = [ "s3" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub = {
|
boot.loader.grub.device = "/dev/vda";
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hostName = "nix-cache01";
|
networking.hostName = "garage01";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
@@ -41,7 +45,7 @@
|
|||||||
systemd.network.networks."ens18" = {
|
systemd.network.networks."ens18" = {
|
||||||
matchConfig.Name = "ens18";
|
matchConfig.Name = "ens18";
|
||||||
address = [
|
address = [
|
||||||
"10.69.13.15/24"
|
"10.69.13.26/24"
|
||||||
];
|
];
|
||||||
routes = [
|
routes = [
|
||||||
{ Gateway = "10.69.13.1"; }
|
{ Gateway = "10.69.13.1"; }
|
||||||
@@ -54,9 +58,6 @@
|
|||||||
"nix-command"
|
"nix-command"
|
||||||
"flakes"
|
"flakes"
|
||||||
];
|
];
|
||||||
vault.enable = true;
|
|
||||||
homelab.deploy.enable = true;
|
|
||||||
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
@@ -64,13 +65,11 @@
|
|||||||
git
|
git
|
||||||
];
|
];
|
||||||
|
|
||||||
services.qemuGuest.enable = true;
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "24.05"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,7 +1,6 @@
|
|||||||
{ ... }:
|
{ ... }: {
|
||||||
{
|
|
||||||
imports = [
|
imports = [
|
||||||
./configuration.nix
|
./configuration.nix
|
||||||
../../services/monitoring
|
../../services/garage
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
@@ -13,6 +13,8 @@
|
|||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "home-automation";
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
# Use the systemd-boot EFI boot loader.
|
||||||
boot.loader.grub = {
|
boot.loader.grub = {
|
||||||
@@ -85,6 +87,7 @@
|
|||||||
"--keep-monthly 6"
|
"--keep-monthly 6"
|
||||||
"--keep-within 1d"
|
"--keep-within 1d"
|
||||||
];
|
];
|
||||||
|
extraOptions = [ "--retry-lock=5m" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
|
|||||||
@@ -11,18 +11,14 @@
|
|||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "proxy";
|
||||||
homelab.dns.cnames = [
|
homelab.dns.cnames = [
|
||||||
"nzbget"
|
"nzbget"
|
||||||
"radarr"
|
"radarr"
|
||||||
"sonarr"
|
"sonarr"
|
||||||
"ha"
|
"ha"
|
||||||
"z2m"
|
"z2m"
|
||||||
"grafana"
|
|
||||||
"prometheus"
|
|
||||||
"alertmanager"
|
|
||||||
"jelly"
|
"jelly"
|
||||||
"pyroscope"
|
|
||||||
"pushgw"
|
|
||||||
];
|
];
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
|||||||
@@ -11,6 +11,8 @@
|
|||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "media";
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
# Use the systemd-boot EFI boot loader.
|
||||||
boot.loader.grub = {
|
boot.loader.grub = {
|
||||||
|
|||||||
@@ -14,9 +14,8 @@
|
|||||||
../../services/kanidm
|
../../services/kanidm
|
||||||
];
|
];
|
||||||
|
|
||||||
# Host metadata
|
|
||||||
homelab.host = {
|
homelab.host = {
|
||||||
tier = "test";
|
tier = "prod";
|
||||||
role = "auth";
|
role = "auth";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,110 +0,0 @@
|
|||||||
{
|
|
||||||
pkgs,
|
|
||||||
...
|
|
||||||
}:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
../../common/vm
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub = {
|
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hostName = "monitoring01";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.13/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
sqlite
|
|
||||||
];
|
|
||||||
|
|
||||||
services.qemuGuest.enable = true;
|
|
||||||
|
|
||||||
# Vault secrets management
|
|
||||||
vault.enable = true;
|
|
||||||
homelab.deploy.enable = true;
|
|
||||||
vault.secrets.backup-helper = {
|
|
||||||
secretPath = "shared/backup/password";
|
|
||||||
extractKey = "password";
|
|
||||||
outputDir = "/run/secrets/backup_helper_secret";
|
|
||||||
services = [ "restic-backups-grafana" "restic-backups-grafana-db" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
services.restic.backups.grafana = {
|
|
||||||
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
|
||||||
passwordFile = "/run/secrets/backup_helper_secret";
|
|
||||||
paths = [ "/var/lib/grafana/plugins" ];
|
|
||||||
timerConfig = {
|
|
||||||
OnCalendar = "daily";
|
|
||||||
Persistent = true;
|
|
||||||
RandomizedDelaySec = "2h";
|
|
||||||
};
|
|
||||||
pruneOpts = [
|
|
||||||
"--keep-daily 7"
|
|
||||||
"--keep-weekly 4"
|
|
||||||
"--keep-monthly 6"
|
|
||||||
"--keep-within 1d"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
services.restic.backups.grafana-db = {
|
|
||||||
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
|
||||||
passwordFile = "/run/secrets/backup_helper_secret";
|
|
||||||
command = [ "${pkgs.sqlite}/bin/sqlite3" "/var/lib/grafana/data/grafana.db" ".dump" ];
|
|
||||||
timerConfig = {
|
|
||||||
OnCalendar = "daily";
|
|
||||||
Persistent = true;
|
|
||||||
RandomizedDelaySec = "2h";
|
|
||||||
};
|
|
||||||
pruneOpts = [
|
|
||||||
"--keep-daily 7"
|
|
||||||
"--keep-weekly 4"
|
|
||||||
"--keep-monthly 6"
|
|
||||||
"--keep-within 1d"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
{
|
|
||||||
config,
|
|
||||||
lib,
|
|
||||||
pkgs,
|
|
||||||
modulesPath,
|
|
||||||
...
|
|
||||||
}:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
boot.initrd.availableKernelModules = [
|
|
||||||
"ata_piix"
|
|
||||||
"uhci_hcd"
|
|
||||||
"virtio_pci"
|
|
||||||
"virtio_scsi"
|
|
||||||
"sd_mod"
|
|
||||||
"sr_mod"
|
|
||||||
];
|
|
||||||
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
|
||||||
boot.kernelModules = [
|
|
||||||
"ptp_kvm"
|
|
||||||
];
|
|
||||||
boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" = {
|
|
||||||
device = "/dev/disk/by-label/root";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
networking.useDHCP = lib.mkDefault true;
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
74
hosts/monitoring02/configuration.nix
Normal file
74
hosts/monitoring02/configuration.nix
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
];
|
||||||
|
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "monitoring";
|
||||||
|
};
|
||||||
|
|
||||||
|
homelab.dns.cnames = [ "monitoring" "alertmanager" "grafana" "grafana-test" "metrics" "vmalert" "loki" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "monitoring02";
|
||||||
|
networking.domain = "home.2rjus.net";
|
||||||
|
networking.useNetworkd = true;
|
||||||
|
networking.useDHCP = false;
|
||||||
|
services.resolved.enable = true;
|
||||||
|
networking.nameservers = [
|
||||||
|
"10.69.13.5"
|
||||||
|
"10.69.13.6"
|
||||||
|
];
|
||||||
|
|
||||||
|
systemd.network.enable = true;
|
||||||
|
systemd.network.networks."ens18" = {
|
||||||
|
matchConfig.Name = "ens18";
|
||||||
|
address = [
|
||||||
|
"10.69.13.24/24"
|
||||||
|
];
|
||||||
|
routes = [
|
||||||
|
{ Gateway = "10.69.13.1"; }
|
||||||
|
];
|
||||||
|
linkConfig.RequiredForOnline = "routable";
|
||||||
|
};
|
||||||
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
|
nix.settings.experimental-features = [
|
||||||
|
"nix-command"
|
||||||
|
"flakes"
|
||||||
|
];
|
||||||
|
nix.settings.tarball-ttl = 0;
|
||||||
|
environment.systemPackages = with pkgs; [
|
||||||
|
vim
|
||||||
|
wget
|
||||||
|
git
|
||||||
|
];
|
||||||
|
|
||||||
|
# Open ports in the firewall.
|
||||||
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
|
# Or disable the firewall altogether.
|
||||||
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
|
}
|
||||||
12
hosts/monitoring02/default.nix
Normal file
12
hosts/monitoring02/default.nix
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{ ... }: {
|
||||||
|
imports = [
|
||||||
|
./configuration.nix
|
||||||
|
../../services/grafana
|
||||||
|
../../services/victoriametrics
|
||||||
|
../../services/loki
|
||||||
|
../../services/monitoring/alerttonotify.nix
|
||||||
|
../../services/monitoring/blackbox.nix
|
||||||
|
../../services/monitoring/exportarr.nix
|
||||||
|
../../services/monitoring/pve.nix
|
||||||
|
];
|
||||||
|
}
|
||||||
@@ -11,6 +11,8 @@
|
|||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "messaging";
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
# Use the systemd-boot EFI boot loader.
|
||||||
boot.loader.grub = {
|
boot.loader.grub = {
|
||||||
|
|||||||
@@ -1,42 +0,0 @@
|
|||||||
{
|
|
||||||
config,
|
|
||||||
lib,
|
|
||||||
pkgs,
|
|
||||||
modulesPath,
|
|
||||||
...
|
|
||||||
}:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
boot.initrd.availableKernelModules = [
|
|
||||||
"ata_piix"
|
|
||||||
"uhci_hcd"
|
|
||||||
"virtio_pci"
|
|
||||||
"virtio_scsi"
|
|
||||||
"sd_mod"
|
|
||||||
"sr_mod"
|
|
||||||
];
|
|
||||||
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
|
||||||
boot.kernelModules = [
|
|
||||||
"ptp_kvm"
|
|
||||||
];
|
|
||||||
boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" = {
|
|
||||||
device = "/dev/disk/by-label/root";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
networking.useDHCP = lib.mkDefault true;
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
45
hosts/nix-cache02/builder.nix
Normal file
45
hosts/nix-cache02/builder.nix
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
{ config, ... }:
|
||||||
|
{
|
||||||
|
# Fetch builder NKey from Vault
|
||||||
|
vault.secrets.builder-nkey = {
|
||||||
|
secretPath = "shared/homelab-deploy/builder-nkey";
|
||||||
|
extractKey = "nkey";
|
||||||
|
outputDir = "/run/secrets/builder-nkey";
|
||||||
|
services = [ "homelab-deploy-builder" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Configure the builder service
|
||||||
|
services.homelab-deploy.builder = {
|
||||||
|
enable = true;
|
||||||
|
natsUrl = "nats://nats1.home.2rjus.net:4222";
|
||||||
|
nkeyFile = "/run/secrets/builder-nkey";
|
||||||
|
|
||||||
|
settings.repos = {
|
||||||
|
nixos-servers = {
|
||||||
|
url = "git+https://git.t-juice.club/torjus/nixos-servers.git";
|
||||||
|
defaultBranch = "master";
|
||||||
|
};
|
||||||
|
nixos = {
|
||||||
|
url = "git+https://git.t-juice.club/torjus/nixos.git";
|
||||||
|
defaultBranch = "master";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
timeout = 14400;
|
||||||
|
metrics.enable = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
# Expose builder metrics for Prometheus scraping
|
||||||
|
homelab.monitoring.scrapeTargets = [
|
||||||
|
{
|
||||||
|
job_name = "homelab-deploy-builder";
|
||||||
|
port = 9973;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
|
||||||
|
# Ensure builder starts after vault secret is available
|
||||||
|
systemd.services.homelab-deploy-builder = {
|
||||||
|
after = [ "vault-secret-builder-nkey.service" ];
|
||||||
|
requires = [ "vault-secret-builder-nkey.service" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
74
hosts/nix-cache02/configuration.nix
Normal file
74
hosts/nix-cache02/configuration.nix
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
];
|
||||||
|
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "build-host";
|
||||||
|
};
|
||||||
|
|
||||||
|
homelab.dns.cnames = [ "nix-cache" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "nix-cache02";
|
||||||
|
networking.domain = "home.2rjus.net";
|
||||||
|
networking.useNetworkd = true;
|
||||||
|
networking.useDHCP = false;
|
||||||
|
services.resolved.enable = true;
|
||||||
|
networking.nameservers = [
|
||||||
|
"10.69.13.5"
|
||||||
|
"10.69.13.6"
|
||||||
|
];
|
||||||
|
|
||||||
|
systemd.network.enable = true;
|
||||||
|
systemd.network.networks."ens18" = {
|
||||||
|
matchConfig.Name = "ens18";
|
||||||
|
address = [
|
||||||
|
"10.69.13.25/24"
|
||||||
|
];
|
||||||
|
routes = [
|
||||||
|
{ Gateway = "10.69.13.1"; }
|
||||||
|
];
|
||||||
|
linkConfig.RequiredForOnline = "routable";
|
||||||
|
};
|
||||||
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
|
nix.settings.experimental-features = [
|
||||||
|
"nix-command"
|
||||||
|
"flakes"
|
||||||
|
];
|
||||||
|
nix.settings.tarball-ttl = 0;
|
||||||
|
environment.systemPackages = with pkgs; [
|
||||||
|
vim
|
||||||
|
wget
|
||||||
|
git
|
||||||
|
];
|
||||||
|
|
||||||
|
# Open ports in the firewall.
|
||||||
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
|
# Or disable the firewall altogether.
|
||||||
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
|
}
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
{ ... }:
|
{ ... }: {
|
||||||
{
|
|
||||||
imports = [
|
imports = [
|
||||||
./configuration.nix
|
./configuration.nix
|
||||||
|
./builder.nix
|
||||||
|
./scheduler.nix
|
||||||
../../services/nix-cache
|
../../services/nix-cache
|
||||||
../../services/actions-runner
|
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
61
hosts/nix-cache02/scheduler.nix
Normal file
61
hosts/nix-cache02/scheduler.nix
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
{ config, pkgs, lib, inputs, ... }:
|
||||||
|
let
|
||||||
|
homelab-deploy = inputs.homelab-deploy.packages.${pkgs.system}.default;
|
||||||
|
|
||||||
|
scheduledBuildScript = pkgs.writeShellApplication {
|
||||||
|
name = "scheduled-build";
|
||||||
|
runtimeInputs = [ homelab-deploy ];
|
||||||
|
text = ''
|
||||||
|
NATS_URL="nats://nats1.home.2rjus.net:4222"
|
||||||
|
NKEY_FILE="/run/secrets/scheduler-nkey"
|
||||||
|
|
||||||
|
echo "Starting scheduled builds at $(date)"
|
||||||
|
|
||||||
|
# Build all nixos-servers hosts
|
||||||
|
homelab-deploy build \
|
||||||
|
--nats-url "$NATS_URL" \
|
||||||
|
--nkey-file "$NKEY_FILE" \
|
||||||
|
nixos-servers --all
|
||||||
|
|
||||||
|
# Build all nixos (gunter) hosts
|
||||||
|
homelab-deploy build \
|
||||||
|
--nats-url "$NATS_URL" \
|
||||||
|
--nkey-file "$NKEY_FILE" \
|
||||||
|
nixos --all
|
||||||
|
|
||||||
|
echo "Scheduled builds completed at $(date)"
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
in
|
||||||
|
{
|
||||||
|
# Fetch scheduler NKey from Vault
|
||||||
|
vault.secrets.scheduler-nkey = {
|
||||||
|
secretPath = "shared/homelab-deploy/scheduler-nkey";
|
||||||
|
extractKey = "nkey";
|
||||||
|
outputDir = "/run/secrets/scheduler-nkey";
|
||||||
|
services = [ "scheduled-build" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Timer: every 2 hours
|
||||||
|
systemd.timers.scheduled-build = {
|
||||||
|
description = "Trigger scheduled Nix builds";
|
||||||
|
wantedBy = [ "timers.target" ];
|
||||||
|
timerConfig = {
|
||||||
|
OnCalendar = "*-*-* 00/2:00:00"; # Every 2 hours at :00
|
||||||
|
Persistent = true; # Run missed builds on boot
|
||||||
|
RandomizedDelaySec = "5m"; # Slight jitter
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Service: oneshot that triggers builds
|
||||||
|
systemd.services.scheduled-build = {
|
||||||
|
description = "Trigger builds for all hosts via NATS";
|
||||||
|
after = [ "network-online.target" "vault-secret-scheduler-nkey.service" ];
|
||||||
|
requires = [ "vault-secret-scheduler-nkey.service" ];
|
||||||
|
wants = [ "network-online.target" ];
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
ExecStart = lib.getExe scheduledBuildScript;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -6,7 +6,8 @@ let
|
|||||||
text = ''
|
text = ''
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
LOKI_URL="http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"
|
LOKI_URL="https://loki.home.2rjus.net/loki/api/v1/push"
|
||||||
|
LOKI_AUTH_FILE="/run/secrets/promtail-loki-auth"
|
||||||
|
|
||||||
# Send a log entry to Loki with bootstrap status
|
# Send a log entry to Loki with bootstrap status
|
||||||
# Usage: log_to_loki <stage> <message>
|
# Usage: log_to_loki <stage> <message>
|
||||||
@@ -28,7 +29,7 @@ let
|
|||||||
streams: [{
|
streams: [{
|
||||||
stream: {
|
stream: {
|
||||||
job: "bootstrap",
|
job: "bootstrap",
|
||||||
host: $host,
|
hostname: $host,
|
||||||
stage: $stage,
|
stage: $stage,
|
||||||
branch: $branch
|
branch: $branch
|
||||||
},
|
},
|
||||||
@@ -36,8 +37,14 @@ let
|
|||||||
}]
|
}]
|
||||||
}')
|
}')
|
||||||
|
|
||||||
|
local auth_args=()
|
||||||
|
if [[ -f "$LOKI_AUTH_FILE" ]]; then
|
||||||
|
auth_args=(-u "promtail:$(cat "$LOKI_AUTH_FILE")")
|
||||||
|
fi
|
||||||
|
|
||||||
curl -s --connect-timeout 2 --max-time 5 \
|
curl -s --connect-timeout 2 --max-time 5 \
|
||||||
-X POST \
|
-X POST \
|
||||||
|
"''${auth_args[@]}" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d "$payload" \
|
-d "$payload" \
|
||||||
"$LOKI_URL" >/dev/null 2>&1 || true
|
"$LOKI_URL" >/dev/null 2>&1 || true
|
||||||
|
|||||||
@@ -35,6 +35,7 @@
|
|||||||
homelab.host = {
|
homelab.host = {
|
||||||
tier = "test";
|
tier = "test";
|
||||||
priority = "low";
|
priority = "low";
|
||||||
|
labels.ansible = "false"; # Exclude from Ansible inventory
|
||||||
};
|
};
|
||||||
|
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
|
|||||||
@@ -14,9 +14,9 @@
|
|||||||
../../common/ssh-audit.nix
|
../../common/ssh-audit.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
# Host metadata (adjust as needed)
|
|
||||||
homelab.host = {
|
homelab.host = {
|
||||||
tier = "test"; # Start in test tier, move to prod after validation
|
tier = "test";
|
||||||
|
role = "test";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Enable Vault integration
|
# Enable Vault integration
|
||||||
@@ -25,6 +25,9 @@
|
|||||||
# Enable remote deployment via NATS
|
# Enable remote deployment via NATS
|
||||||
homelab.deploy.enable = true;
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
# Enable Kanidm PAM/NSS for central authentication
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/vda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|||||||
@@ -14,9 +14,9 @@
|
|||||||
../../common/ssh-audit.nix
|
../../common/ssh-audit.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
# Host metadata (adjust as needed)
|
|
||||||
homelab.host = {
|
homelab.host = {
|
||||||
tier = "test"; # Start in test tier, move to prod after validation
|
tier = "test";
|
||||||
|
role = "test";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Enable Vault integration
|
# Enable Vault integration
|
||||||
@@ -25,6 +25,9 @@
|
|||||||
# Enable remote deployment via NATS
|
# Enable remote deployment via NATS
|
||||||
homelab.deploy.enable = true;
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
# Enable Kanidm PAM/NSS for central authentication
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/vda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|||||||
@@ -14,9 +14,9 @@
|
|||||||
../../common/ssh-audit.nix
|
../../common/ssh-audit.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
# Host metadata (adjust as needed)
|
|
||||||
homelab.host = {
|
homelab.host = {
|
||||||
tier = "test"; # Start in test tier, move to prod after validation
|
tier = "test";
|
||||||
|
role = "test";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Enable Vault integration
|
# Enable Vault integration
|
||||||
@@ -25,6 +25,9 @@
|
|||||||
# Enable remote deployment via NATS
|
# Enable remote deployment via NATS
|
||||||
homelab.deploy.enable = true;
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
# Enable Kanidm PAM/NSS for central authentication
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/vda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|||||||
@@ -58,10 +58,9 @@ let
|
|||||||
};
|
};
|
||||||
|
|
||||||
# Build effective labels for a host
|
# Build effective labels for a host
|
||||||
# Always includes hostname; only includes tier/priority/role if non-default
|
# Always includes hostname and tier; only includes priority/role if non-default
|
||||||
buildEffectiveLabels = host:
|
buildEffectiveLabels = host:
|
||||||
{ hostname = host.hostname; }
|
{ hostname = host.hostname; tier = host.tier; }
|
||||||
// (lib.optionalAttrs (host.tier != "prod") { tier = host.tier; })
|
|
||||||
// (lib.optionalAttrs (host.priority != "high") { priority = host.priority; })
|
// (lib.optionalAttrs (host.priority != "high") { priority = host.priority; })
|
||||||
// (lib.optionalAttrs (host.role != null) { role = host.role; })
|
// (lib.optionalAttrs (host.role != null) { role = host.role; })
|
||||||
// host.labels;
|
// host.labels;
|
||||||
|
|||||||
@@ -1,5 +0,0 @@
|
|||||||
[proxmox]
|
|
||||||
pve1.home.2rjus.net
|
|
||||||
|
|
||||||
[proxmox:vars]
|
|
||||||
ansible_user=root
|
|
||||||
@@ -20,10 +20,10 @@ vault-fetch <secret-path> <output-directory> [cache-directory]
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Fetch Grafana admin secrets
|
# Fetch Grafana admin secrets
|
||||||
vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana /var/lib/vault/cache/grafana
|
vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana /var/lib/vault/cache/grafana
|
||||||
|
|
||||||
# Use default cache location
|
# Use default cache location
|
||||||
vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana
|
vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana
|
||||||
```
|
```
|
||||||
|
|
||||||
## How It Works
|
## How It Works
|
||||||
@@ -53,13 +53,13 @@ If Vault is unreachable or authentication fails:
|
|||||||
This tool is designed to be called from systemd service `ExecStartPre` hooks via the `vault.secrets` NixOS module:
|
This tool is designed to be called from systemd service `ExecStartPre` hooks via the `vault.secrets` NixOS module:
|
||||||
|
|
||||||
```nix
|
```nix
|
||||||
vault.secrets.grafana-admin = {
|
vault.secrets.mqtt-password = {
|
||||||
secretPath = "hosts/monitoring01/grafana-admin";
|
secretPath = "hosts/ha1/mqtt-password";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Service automatically gets secrets fetched before start
|
# Service automatically gets secrets fetched before start
|
||||||
systemd.services.grafana.serviceConfig = {
|
systemd.services.mosquitto.serviceConfig = {
|
||||||
EnvironmentFile = "/run/secrets/grafana-admin/password";
|
EnvironmentFile = "/run/secrets/mqtt-password/password";
|
||||||
};
|
};
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ set -euo pipefail
|
|||||||
#
|
#
|
||||||
# Usage: vault-fetch <secret-path> <output-directory> [cache-directory]
|
# Usage: vault-fetch <secret-path> <output-directory> [cache-directory]
|
||||||
#
|
#
|
||||||
# Example: vault-fetch hosts/monitoring01/grafana-admin /run/secrets/grafana /var/lib/vault/cache/grafana
|
# Example: vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana /var/lib/vault/cache/grafana
|
||||||
#
|
#
|
||||||
# This script:
|
# This script:
|
||||||
# 1. Authenticates to Vault using AppRole credentials from /var/lib/vault/approle/
|
# 1. Authenticates to Vault using AppRole credentials from /var/lib/vault/approle/
|
||||||
@@ -17,7 +17,7 @@ set -euo pipefail
|
|||||||
# Parse arguments
|
# Parse arguments
|
||||||
if [ $# -lt 2 ]; then
|
if [ $# -lt 2 ]; then
|
||||||
echo "Usage: vault-fetch <secret-path> <output-directory> [cache-directory]" >&2
|
echo "Usage: vault-fetch <secret-path> <output-directory> [cache-directory]" >&2
|
||||||
echo "Example: vault-fetch hosts/monitoring01/grafana /run/secrets/grafana /var/lib/vault/cache/grafana" >&2
|
echo "Example: vault-fetch hosts/ha1/mqtt-password /run/secrets/grafana /var/lib/vault/cache/grafana" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -1,57 +0,0 @@
|
|||||||
{ pkgs, config, ... }:
|
|
||||||
{
|
|
||||||
vault.secrets.actions-token = {
|
|
||||||
secretPath = "hosts/nix-cache01/actions-token";
|
|
||||||
extractKey = "token";
|
|
||||||
outputDir = "/run/secrets/actions-token-1";
|
|
||||||
services = [ "gitea-runner-actions1" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
virtualisation.podman = {
|
|
||||||
enable = true;
|
|
||||||
dockerCompat = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
services.gitea-actions-runner.instances = {
|
|
||||||
actions1 = {
|
|
||||||
enable = true;
|
|
||||||
tokenFile = "/run/secrets/actions-token-1";
|
|
||||||
name = "actions1.home.2rjus.net";
|
|
||||||
settings = {
|
|
||||||
log = {
|
|
||||||
level = "debug";
|
|
||||||
};
|
|
||||||
|
|
||||||
runner = {
|
|
||||||
file = ".runner";
|
|
||||||
capacity = 4;
|
|
||||||
timeout = "2h";
|
|
||||||
shutdown_timeout = "10m";
|
|
||||||
insecure = false;
|
|
||||||
fetch_timeout = "10s";
|
|
||||||
fetch_interval = "30s";
|
|
||||||
};
|
|
||||||
|
|
||||||
cache = {
|
|
||||||
enabled = true;
|
|
||||||
dir = "/var/cache/gitea-actions1";
|
|
||||||
};
|
|
||||||
|
|
||||||
container = {
|
|
||||||
privileged = false;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
labels =
|
|
||||||
builtins.map (n: "${n}:docker://gitea/runner-images:${n}") [
|
|
||||||
"ubuntu-latest"
|
|
||||||
"ubuntu-latest-slim"
|
|
||||||
"ubuntu-latest-full"
|
|
||||||
]
|
|
||||||
++ [
|
|
||||||
"homelab"
|
|
||||||
];
|
|
||||||
|
|
||||||
url = "https://git.t-juice.club";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
64
services/garage/default.nix
Normal file
64
services/garage/default.nix
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
{ config, pkgs, ... }:
|
||||||
|
{
|
||||||
|
homelab.monitoring.scrapeTargets = [
|
||||||
|
{
|
||||||
|
job_name = "garage";
|
||||||
|
port = 3903;
|
||||||
|
metrics_path = "/metrics";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "caddy";
|
||||||
|
port = 9117;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
|
||||||
|
vault.secrets.garage-env = {
|
||||||
|
secretPath = "hosts/${config.networking.hostName}/garage";
|
||||||
|
extractKey = "env";
|
||||||
|
outputDir = "/run/secrets/garage-env";
|
||||||
|
services = [ "garage" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
services.garage = {
|
||||||
|
enable = true;
|
||||||
|
package = pkgs.garage;
|
||||||
|
environmentFile = "/run/secrets/garage-env";
|
||||||
|
settings = {
|
||||||
|
metadata_dir = "/var/lib/garage/meta";
|
||||||
|
data_dir = "/var/lib/garage/data";
|
||||||
|
replication_factor = 1;
|
||||||
|
rpc_bind_addr = "[::]:3901";
|
||||||
|
rpc_public_addr = "garage01.home.2rjus.net:3901";
|
||||||
|
s3_api = {
|
||||||
|
api_bind_addr = "[::]:3900";
|
||||||
|
s3_region = "garage";
|
||||||
|
root_domain = ".s3.home.2rjus.net";
|
||||||
|
};
|
||||||
|
admin = {
|
||||||
|
api_bind_addr = "[::]:3903";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
services.caddy = {
|
||||||
|
enable = true;
|
||||||
|
package = pkgs.unstable.caddy;
|
||||||
|
configFile = pkgs.writeText "Caddyfile" ''
|
||||||
|
{
|
||||||
|
acme_ca https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory
|
||||||
|
metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
s3.home.2rjus.net {
|
||||||
|
reverse_proxy http://localhost:3900
|
||||||
|
}
|
||||||
|
|
||||||
|
http://garage01.home.2rjus.net:9117 {
|
||||||
|
handle /metrics {
|
||||||
|
metrics
|
||||||
|
}
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
391
services/grafana/dashboards/apiary.json
Normal file
391
services/grafana/dashboards/apiary.json
Normal file
@@ -0,0 +1,391 @@
|
|||||||
|
{
|
||||||
|
"uid": "apiary-homelab",
|
||||||
|
"title": "Apiary - Honeypot",
|
||||||
|
"tags": ["apiary", "honeypot", "prometheus", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "1m",
|
||||||
|
"time": {
|
||||||
|
"from": "now-24h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"templating": {
|
||||||
|
"list": []
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "SSH Connections",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(oubliette_ssh_connections_total{job=\"apiary\"})",
|
||||||
|
"legendFormat": "Total",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "blue", "value": null}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Total SSH connections across all outcomes"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Active Sessions",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "oubliette_sessions_active{job=\"apiary\"}",
|
||||||
|
"legendFormat": "Active",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 5},
|
||||||
|
{"color": "red", "value": 20}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Currently active honeypot sessions"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Unique IPs",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "oubliette_storage_unique_ips{job=\"apiary\"}",
|
||||||
|
"legendFormat": "IPs",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "purple", "value": null}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Total unique source IPs observed"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Total Login Attempts",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "oubliette_storage_login_attempts_total{job=\"apiary\"}",
|
||||||
|
"legendFormat": "Attempts",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "orange", "value": null}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Total login attempts stored"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "SSH Connections Over Time",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"interval": "60s",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(oubliette_ssh_connections_total{job=\"apiary\"}[$__rate_interval])",
|
||||||
|
"legendFormat": "{{outcome}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "cps",
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "line",
|
||||||
|
"lineInterpolation": "smooth",
|
||||||
|
"fillOpacity": 20,
|
||||||
|
"pointSize": 5,
|
||||||
|
"showPoints": "auto",
|
||||||
|
"stacking": {"mode": "none"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "SSH connection rate by outcome"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Auth Attempts Over Time",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"interval": "60s",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(oubliette_auth_attempts_total{job=\"apiary\"}[$__rate_interval])",
|
||||||
|
"legendFormat": "{{reason}} - {{result}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "cps",
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "line",
|
||||||
|
"lineInterpolation": "smooth",
|
||||||
|
"fillOpacity": 20,
|
||||||
|
"pointSize": 5,
|
||||||
|
"showPoints": "auto",
|
||||||
|
"stacking": {"mode": "none"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Authentication attempt rate by reason and result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Sessions by Shell",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 22},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"interval": "60s",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(oubliette_sessions_total{job=\"apiary\"}[$__rate_interval])",
|
||||||
|
"legendFormat": "{{shell}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "cps",
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "line",
|
||||||
|
"lineInterpolation": "smooth",
|
||||||
|
"fillOpacity": 20,
|
||||||
|
"pointSize": 5,
|
||||||
|
"showPoints": "auto",
|
||||||
|
"stacking": {"mode": "normal"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Session creation rate by shell type"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "Attempts by Country",
|
||||||
|
"type": "geomap",
|
||||||
|
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 12},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "oubliette_auth_attempts_by_country_total{job=\"apiary\"}",
|
||||||
|
"legendFormat": "{{country}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 10},
|
||||||
|
{"color": "orange", "value": 50},
|
||||||
|
{"color": "red", "value": 200}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"view": {
|
||||||
|
"id": "zero",
|
||||||
|
"lat": 30,
|
||||||
|
"lon": 10,
|
||||||
|
"zoom": 2
|
||||||
|
},
|
||||||
|
"basemap": {
|
||||||
|
"type": "default"
|
||||||
|
},
|
||||||
|
"layers": [
|
||||||
|
{
|
||||||
|
"type": "markers",
|
||||||
|
"name": "Auth Attempts",
|
||||||
|
"config": {
|
||||||
|
"showLegend": true,
|
||||||
|
"style": {
|
||||||
|
"size": {
|
||||||
|
"field": "Value",
|
||||||
|
"min": 3,
|
||||||
|
"max": 20
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"field": "Value"
|
||||||
|
},
|
||||||
|
"symbol": {
|
||||||
|
"mode": "fixed",
|
||||||
|
"fixed": "img/icons/marker/circle.svg"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"location": {
|
||||||
|
"mode": "lookup",
|
||||||
|
"lookup": "country",
|
||||||
|
"gazetteer": "public/gazetteer/countries.json"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"description": "Authentication attempts by country (geo lookup from country code)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "Session Duration Distribution",
|
||||||
|
"type": "heatmap",
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 30},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"interval": "60s",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(oubliette_session_duration_seconds_bucket{job=\"apiary\"}[$__rate_interval])",
|
||||||
|
"legendFormat": "{{le}}",
|
||||||
|
"refId": "A",
|
||||||
|
"format": "heatmap"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"scaleDistribution": {
|
||||||
|
"type": "log",
|
||||||
|
"log": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"calculate": false,
|
||||||
|
"yAxis": {
|
||||||
|
"unit": "s"
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"scheme": "Oranges",
|
||||||
|
"mode": "scheme"
|
||||||
|
},
|
||||||
|
"cellGap": 1,
|
||||||
|
"tooltip": {
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "Distribution of session durations"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"title": "Commands Executed by Shell",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 22},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"interval": "60s",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(oubliette_commands_executed_total{job=\"apiary\"}[$__rate_interval])",
|
||||||
|
"legendFormat": "{{shell}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "cps",
|
||||||
|
"custom": {
|
||||||
|
"drawStyle": "line",
|
||||||
|
"lineInterpolation": "smooth",
|
||||||
|
"fillOpacity": 20,
|
||||||
|
"pointSize": 5,
|
||||||
|
"showPoints": "auto",
|
||||||
|
"stacking": {"mode": "normal"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Rate of commands executed in honeypot shells"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
446
services/grafana/dashboards/certificates.json
Normal file
446
services/grafana/dashboards/certificates.json
Normal file
@@ -0,0 +1,446 @@
|
|||||||
|
{
|
||||||
|
"uid": "certificates-homelab",
|
||||||
|
"title": "TLS Certificates",
|
||||||
|
"tags": ["certificates", "tls", "security", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "5m",
|
||||||
|
"time": {
|
||||||
|
"from": "now-7d",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Endpoints Monitored",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(probe_ssl_earliest_cert_expiry{job=\"blackbox_tls\"})",
|
||||||
|
"legendFormat": "Total",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "blue", "value": null}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Total number of TLS endpoints being monitored"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Probe Failures",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(probe_success{job=\"blackbox_tls\"} == 0) or vector(0)",
|
||||||
|
"legendFormat": "Failing",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "red", "value": 1}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Number of endpoints where TLS probe is failing"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Expiring Soon (< 7d)",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count((probe_ssl_earliest_cert_expiry{job=\"blackbox_tls\"} - time()) < 86400 * 7) or vector(0)",
|
||||||
|
"legendFormat": "Warning",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 1}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Certificates expiring within 7 days"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Expiring Critical (< 24h)",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count((probe_ssl_earliest_cert_expiry{job=\"blackbox_tls\"} - time()) < 86400) or vector(0)",
|
||||||
|
"legendFormat": "Critical",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "red", "value": 1}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Certificates expiring within 24 hours"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Minimum Days Remaining",
|
||||||
|
"type": "gauge",
|
||||||
|
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "min((probe_ssl_earliest_cert_expiry{job=\"blackbox_tls\"} - time()) / 86400)",
|
||||||
|
"legendFormat": "Days",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "d",
|
||||||
|
"min": 0,
|
||||||
|
"max": 90,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": null},
|
||||||
|
{"color": "orange", "value": 7},
|
||||||
|
{"color": "yellow", "value": 14},
|
||||||
|
{"color": "green", "value": 30}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true
|
||||||
|
},
|
||||||
|
"description": "Shortest time until any certificate expires"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Certificate Expiry by Endpoint",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 12, "w": 12, "x": 0, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "(probe_ssl_earliest_cert_expiry{job=\"blackbox_tls\"} - time()) / 86400",
|
||||||
|
"legendFormat": "{{instance}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {"Time": true, "job": true, "__name__": true},
|
||||||
|
"renameByName": {"instance": "Endpoint", "Value": "Days Until Expiry"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "sortBy",
|
||||||
|
"options": {
|
||||||
|
"sort": [{"field": "Days Until Expiry", "desc": false}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"align": "left"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Days Until Expiry"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "d"},
|
||||||
|
{"id": "decimals", "value": 1},
|
||||||
|
{"id": "custom.width", "value": 150},
|
||||||
|
{
|
||||||
|
"id": "thresholds",
|
||||||
|
"value": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": null},
|
||||||
|
{"color": "orange", "value": 7},
|
||||||
|
{"color": "yellow", "value": 14},
|
||||||
|
{"color": "green", "value": 30}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "color-background"}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Days Until Expiry", "desc": false}]
|
||||||
|
},
|
||||||
|
"description": "All monitored endpoints sorted by days until certificate expiry"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Probe Status",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 12, "w": 12, "x": 12, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "probe_success{job=\"blackbox_tls\"}",
|
||||||
|
"legendFormat": "{{instance}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "probe_http_status_code{job=\"blackbox_tls\"}",
|
||||||
|
"legendFormat": "{{instance}}",
|
||||||
|
"refId": "B",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "probe_duration_seconds{job=\"blackbox_tls\"}",
|
||||||
|
"legendFormat": "{{instance}}",
|
||||||
|
"refId": "C",
|
||||||
|
"instant": true,
|
||||||
|
"format": "table"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "instance",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {"Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "job": true, "job 1": true, "job 2": true, "job 3": true, "__name__": true},
|
||||||
|
"renameByName": {
|
||||||
|
"instance": "Endpoint",
|
||||||
|
"Value #A": "Success",
|
||||||
|
"Value #B": "HTTP Status",
|
||||||
|
"Value #C": "Duration"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {"align": "left"}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Success"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "custom.width", "value": 80},
|
||||||
|
{"id": "mappings", "value": [
|
||||||
|
{"type": "value", "options": {"0": {"text": "FAIL", "color": "red"}}},
|
||||||
|
{"type": "value", "options": {"1": {"text": "OK", "color": "green"}}}
|
||||||
|
]},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "HTTP Status"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "custom.width", "value": 100}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Duration"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "s"},
|
||||||
|
{"id": "decimals", "value": 3},
|
||||||
|
{"id": "custom.width", "value": 100}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true
|
||||||
|
},
|
||||||
|
"description": "Probe success status, HTTP response code, and probe duration"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "Certificate Expiry Over Time",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "(probe_ssl_earliest_cert_expiry{job=\"blackbox_tls\"} - time()) / 86400",
|
||||||
|
"legendFormat": "{{instance}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "d",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"showPoints": "never"
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": null},
|
||||||
|
{"color": "orange", "value": 7},
|
||||||
|
{"color": "yellow", "value": 14},
|
||||||
|
{"color": "green", "value": 30}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "table", "placement": "right", "calcs": ["lastNotNull"]},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Days until certificate expiry over time - useful for spotting renewal patterns"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "Probe Success Rate",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "avg(probe_success{job=\"blackbox_tls\"}) * 100",
|
||||||
|
"legendFormat": "Success Rate",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 20,
|
||||||
|
"showPoints": "never"
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": null},
|
||||||
|
{"color": "yellow", "value": 90},
|
||||||
|
{"color": "green", "value": 100}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": {"mode": "thresholds"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "single"}
|
||||||
|
},
|
||||||
|
"description": "Overall probe success rate across all endpoints"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"title": "Probe Duration",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "probe_duration_seconds{job=\"blackbox_tls\"}",
|
||||||
|
"legendFormat": "{{instance}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 0,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "table", "placement": "right", "calcs": ["mean", "max"]},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Time taken to complete TLS probe for each endpoint"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
85
services/grafana/dashboards/logs.json
Normal file
85
services/grafana/dashboards/logs.json
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
{
|
||||||
|
"uid": "logs-homelab",
|
||||||
|
"title": "Logs - Homelab",
|
||||||
|
"tags": ["loki", "logs", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "host",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"query": "label_values(host)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": false,
|
||||||
|
"current": {"text": "All", "value": "$__all"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "job",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"query": "label_values(job)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": false,
|
||||||
|
"current": {"text": "All", "value": "$__all"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "search",
|
||||||
|
"type": "textbox",
|
||||||
|
"current": {"text": "", "value": ""},
|
||||||
|
"label": "Search"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Log Volume",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 6, "w": 24, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (host) (count_over_time({host=~\"$host\", job=~\"$job\"} |~ \"$search\" [1m]))",
|
||||||
|
"legendFormat": "{{host}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Logs",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": {"h": 18, "w": 24, "x": 0, "y": 6},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{host=~\"$host\", job=~\"$job\"} |~ \"$search\"",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"showCommonLabels": false,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": false,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
949
services/grafana/dashboards/nixos-fleet.json
Normal file
949
services/grafana/dashboards/nixos-fleet.json
Normal file
@@ -0,0 +1,949 @@
|
|||||||
|
{
|
||||||
|
"uid": "nixos-fleet-homelab",
|
||||||
|
"title": "NixOS Fleet - Homelab",
|
||||||
|
"tags": ["nixos", "fleet", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "1m",
|
||||||
|
"time": {
|
||||||
|
"from": "now-7d",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "tier",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"query": "label_values(nixos_flake_info, tier)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": false,
|
||||||
|
"current": {"text": "All", "value": "$__all"}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Hosts Behind Remote",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 1)",
|
||||||
|
"legendFormat": "Behind",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 1},
|
||||||
|
{"color": "red", "value": 5}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"description": "Number of hosts where current revision differs from remote master"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Hosts Needing Reboot",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(nixos_config_mismatch{tier=~\"$tier\"} == 1)",
|
||||||
|
"legendFormat": "Need Reboot",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 1},
|
||||||
|
{"color": "orange", "value": 3},
|
||||||
|
{"color": "red", "value": 5}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Hosts where booted generation differs from current (switched but not rebooted)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Total Hosts",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 3, "x": 8, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(nixos_flake_info{tier=~\"$tier\"})",
|
||||||
|
"legendFormat": "Hosts",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "blue", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Nixpkgs Age",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 3, "x": 11, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "max(nixos_flake_input_age_seconds{input=\"nixpkgs\", tier=~\"$tier\"})",
|
||||||
|
"legendFormat": "Nixpkgs",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 604800},
|
||||||
|
{"color": "orange", "value": 1209600},
|
||||||
|
{"color": "red", "value": 2592000}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Age of nixpkgs flake input (yellow >7d, orange >14d, red >30d)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Hosts Up-to-date",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 3, "x": 14, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(nixos_flake_revision_behind{tier=~\"$tier\"} == 0)",
|
||||||
|
"legendFormat": "Up-to-date",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "green", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"title": "Deployments (24h)",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 3, "x": 17, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_deployments_total{status=\"completed\"}[24h]))",
|
||||||
|
"legendFormat": "Deployments",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "blue", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0",
|
||||||
|
"decimals": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Total successful deployments in the last 24 hours"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"title": "Avg Deploy Time",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_deployment_duration_seconds_sum{success=\"true\"}[24h])) / sum(increase(homelab_deploy_deployment_duration_seconds_count{success=\"true\"}[24h]))",
|
||||||
|
"legendFormat": "Avg Time",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 30},
|
||||||
|
{"color": "red", "value": 60}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "-"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Average deployment duration over the last 24 hours (yellow >30s, red >60s)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Fleet Status",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "nixos_flake_info{tier=~\"$tier\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "info"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "nixos_flake_revision_behind{tier=~\"$tier\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "behind"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "nixos_config_mismatch{tier=~\"$tier\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "mismatch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "nixos_generation_age_seconds{tier=~\"$tier\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "age"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "nixos_generation_count{tier=~\"$tier\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "count"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Hostname"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 120}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Current Rev"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 90}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Remote Rev"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 90}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Behind"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "custom.width", "value": 70},
|
||||||
|
{"id": "mappings", "value": [
|
||||||
|
{"type": "value", "options": {"0": {"text": "No", "color": "green"}}},
|
||||||
|
{"type": "value", "options": {"1": {"text": "Yes", "color": "red"}}}
|
||||||
|
]},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Need Reboot"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "custom.width", "value": 100},
|
||||||
|
{"id": "mappings", "value": [
|
||||||
|
{"type": "value", "options": {"0": {"text": "No", "color": "green"}}},
|
||||||
|
{"type": "value", "options": {"1": {"text": "Yes", "color": "orange"}}}
|
||||||
|
]},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Config Age"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "s"},
|
||||||
|
{"id": "custom.width", "value": 100}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Generations"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 100}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Tier"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 60}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Role"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 80}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Hostname", "desc": false}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {"byField": "hostname", "mode": "outer"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Time 1": true,
|
||||||
|
"Time 2": true,
|
||||||
|
"Time 3": true,
|
||||||
|
"Time 4": true,
|
||||||
|
"Time 5": true,
|
||||||
|
"Value #info": true,
|
||||||
|
"__name__": true,
|
||||||
|
"__name__ 1": true,
|
||||||
|
"__name__ 2": true,
|
||||||
|
"__name__ 3": true,
|
||||||
|
"__name__ 4": true,
|
||||||
|
"__name__ 5": true,
|
||||||
|
"dns_role": true,
|
||||||
|
"dns_role 1": true,
|
||||||
|
"dns_role 2": true,
|
||||||
|
"dns_role 3": true,
|
||||||
|
"dns_role 4": true,
|
||||||
|
"instance": true,
|
||||||
|
"instance 1": true,
|
||||||
|
"instance 2": true,
|
||||||
|
"instance 3": true,
|
||||||
|
"instance 4": true,
|
||||||
|
"job": true,
|
||||||
|
"job 1": true,
|
||||||
|
"job 2": true,
|
||||||
|
"job 3": true,
|
||||||
|
"job 4": true,
|
||||||
|
"nixos_version": true,
|
||||||
|
"nixpkgs_rev": true,
|
||||||
|
"role 1": true,
|
||||||
|
"role 2": true,
|
||||||
|
"role 3": true,
|
||||||
|
"role 4": true,
|
||||||
|
"tier 1": true,
|
||||||
|
"tier 2": true,
|
||||||
|
"tier 3": true,
|
||||||
|
"tier 4": true
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"hostname": 0,
|
||||||
|
"tier": 1,
|
||||||
|
"role": 2,
|
||||||
|
"current_rev": 3,
|
||||||
|
"remote_rev": 4,
|
||||||
|
"Value #behind": 5,
|
||||||
|
"Value #mismatch": 6,
|
||||||
|
"Value #age": 7,
|
||||||
|
"Value #count": 8
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"hostname": "Hostname",
|
||||||
|
"tier": "Tier",
|
||||||
|
"role": "Role",
|
||||||
|
"current_rev": "Current Rev",
|
||||||
|
"remote_rev": "Remote Rev",
|
||||||
|
"Value #behind": "Behind",
|
||||||
|
"Value #mismatch": "Need Reboot",
|
||||||
|
"Value #age": "Config Age",
|
||||||
|
"Value #count": "Generations"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Generation Age by Host",
|
||||||
|
"type": "bargauge",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sort_desc(nixos_generation_age_seconds{tier=~\"$tier\"})",
|
||||||
|
"legendFormat": "{{hostname}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 86400},
|
||||||
|
{"color": "orange", "value": 259200},
|
||||||
|
{"color": "red", "value": 604800}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"min": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"description": "How long ago each host's current config was deployed (yellow >1d, orange >3d, red >7d)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "Generations per Host",
|
||||||
|
"type": "bargauge",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sort_desc(nixos_generation_count{tier=~\"$tier\"})",
|
||||||
|
"legendFormat": "{{hostname}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "blue", "value": null},
|
||||||
|
{"color": "purple", "value": 50}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"min": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"description": "Total number of NixOS generations on each host"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "Deployment Activity (Generation Age Over Time)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "nixos_generation_age_seconds{tier=~\"$tier\"}",
|
||||||
|
"legendFormat": "{{hostname}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 0,
|
||||||
|
"showPoints": "never",
|
||||||
|
"stacking": {"mode": "none"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Generation age increases over time, drops to near-zero when deployed. Useful to see deployment patterns."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"title": "Flake Input Ages",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 30},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "max by (input) (nixos_flake_input_age_seconds)",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "input"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 150}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Value", "desc": true}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {"Time": true},
|
||||||
|
"renameByName": {
|
||||||
|
"input": "Flake Input",
|
||||||
|
"Value": "Age"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Age of each flake input across the fleet"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"title": "Hosts by Revision",
|
||||||
|
"type": "piechart",
|
||||||
|
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 30},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count by (current_rev) (nixos_flake_info{tier=~\"$tier\"})",
|
||||||
|
"legendFormat": "{{current_rev}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"legend": {"displayMode": "table", "placement": "right", "values": ["value"]},
|
||||||
|
"pieType": "pie"
|
||||||
|
},
|
||||||
|
"description": "Distribution of hosts by their current flake revision"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"title": "Hosts by Tier",
|
||||||
|
"type": "piechart",
|
||||||
|
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 30},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count by (tier) (nixos_flake_info)",
|
||||||
|
"legendFormat": "{{tier}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"legend": {"displayMode": "table", "placement": "right", "values": ["value"]},
|
||||||
|
"pieType": "pie"
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "^$",
|
||||||
|
"renamePattern": "prod"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Distribution of hosts by tier (test vs prod)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"title": "Build Service",
|
||||||
|
"type": "row",
|
||||||
|
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 36},
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"title": "Builds (24h)",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 37},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_build_host_total{status=\"success\"}[24h]))",
|
||||||
|
"legendFormat": "Builds",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "green", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0",
|
||||||
|
"decimals": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Successful host builds in the last 24 hours"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"title": "Failed Builds (24h)",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 37},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_build_host_total{status=\"failure\"}[24h])) or vector(0)",
|
||||||
|
"legendFormat": "Failed",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 1},
|
||||||
|
{"color": "red", "value": 5}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0",
|
||||||
|
"decimals": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Failed host builds in the last 24 hours"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"title": "Last Build",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 37},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "time() - max(homelab_deploy_build_last_timestamp)",
|
||||||
|
"legendFormat": "Last Build",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 86400},
|
||||||
|
{"color": "red", "value": 604800}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "-"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Time since last build attempt (yellow >1d, red >7d)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"title": "Avg Build Time",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 37},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_build_duration_seconds_sum[24h])) / sum(increase(homelab_deploy_build_duration_seconds_count[24h]))",
|
||||||
|
"legendFormat": "Avg Time",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 30},
|
||||||
|
{"color": "red", "value": 60}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "-"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Average build duration per host over the last 24 hours"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"title": "Total Hosts Built",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 37},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(homelab_deploy_build_duration_seconds_count)",
|
||||||
|
"legendFormat": "Hosts",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "blue", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Total number of unique hosts that have been built"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"title": "Build Jobs (24h)",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 37},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_builds_total[24h]))",
|
||||||
|
"legendFormat": "Jobs",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "purple", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0",
|
||||||
|
"decimals": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Total build jobs (each job may build multiple hosts) in the last 24 hours"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"title": "Build Time by Host",
|
||||||
|
"type": "bargauge",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 41},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sort_desc(homelab_deploy_build_duration_seconds_sum / homelab_deploy_build_duration_seconds_count)",
|
||||||
|
"legendFormat": "{{host}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 15},
|
||||||
|
{"color": "orange", "value": 25},
|
||||||
|
{"color": "red", "value": 45}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"min": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"description": "Average build time per host (green <15s, yellow <25s, orange <45s, red >45s)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 23,
|
||||||
|
"title": "Build Count by Host",
|
||||||
|
"type": "bargauge",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sort_desc(sum by (host) (homelab_deploy_build_host_total))",
|
||||||
|
"legendFormat": "{{host}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "blue", "value": null},
|
||||||
|
{"color": "purple", "value": 10}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"min": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"description": "Total build count per host (all time)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 24,
|
||||||
|
"title": "Build Activity",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 49},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_build_host_total{status=\"success\"}[1h]))",
|
||||||
|
"legendFormat": "Successful",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(homelab_deploy_build_host_total{status=\"failure\"}[1h]))",
|
||||||
|
"legendFormat": "Failed",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 30,
|
||||||
|
"showPoints": "never",
|
||||||
|
"stacking": {"mode": "none"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Successful"},
|
||||||
|
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Failed"},
|
||||||
|
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Build activity over time (successful vs failed builds per hour)"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
296
services/grafana/dashboards/nixos-operations.json
Normal file
296
services/grafana/dashboards/nixos-operations.json
Normal file
@@ -0,0 +1,296 @@
|
|||||||
|
{
|
||||||
|
"uid": "nixos-operations",
|
||||||
|
"title": "NixOS Operations",
|
||||||
|
"tags": ["loki", "nixos", "operations", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "1m",
|
||||||
|
"time": {
|
||||||
|
"from": "now-24h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "host",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"query": "label_values(host)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"current": {"text": "All", "value": "$__all"}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Upgrade Log Volume",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} [$__range]))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "blue", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Total log entries from nixos-upgrade.service in selected time range"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Successful Upgrades",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"Done. The new configuration is\" [$__range]))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "green", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Upgrades that completed successfully"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Upgrade Errors",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |~ \"(?i)error|failed\" [$__range]))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "red", "value": 1}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Upgrade log entries containing errors"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Bootstrap Events",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(count_over_time({job=\"bootstrap\", host=~\"$host\"} [$__range]))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "purple", "value": null}]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"description": "Bootstrap log entries from new VM deployments"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Upgrade Activity by Host",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (host) (count_over_time({systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} [5m]))",
|
||||||
|
"legendFormat": "{{host}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 30,
|
||||||
|
"showPoints": "never",
|
||||||
|
"stacking": {"mode": "normal"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "When upgrades ran on each host"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "ACME Certificate Activity",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (host) (count_over_time({systemd_unit=~\"acme.*\", host=~\"$host\"} [5m]))",
|
||||||
|
"legendFormat": "{{host}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 30,
|
||||||
|
"showPoints": "never",
|
||||||
|
"stacking": {"mode": "normal"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "ACME certificate renewal activity"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Recent Upgrade Completions",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"Done. The new configuration is\" | json | line_format \"{{.MESSAGE}}\" | keep host",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"showCommonLabels": false,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": false,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
},
|
||||||
|
"description": "Successful upgrade completion messages showing the new system path"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "Build Activity",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |= \"building\" | json | line_format \"{{.MESSAGE}}\" | keep host",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"showCommonLabels": false,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": false,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
},
|
||||||
|
"description": "Derivations being built during upgrades"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "Bootstrap Logs",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 20},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{job=\"bootstrap\", host=~\"$host\"}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"showCommonLabels": false,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": false,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
},
|
||||||
|
"description": "Logs from VM bootstrap process (new deployments)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"title": "Upgrade Errors & Failures",
|
||||||
|
"type": "logs",
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 28},
|
||||||
|
"datasource": {"type": "loki", "uid": "loki"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{systemd_unit=\"nixos-upgrade.service\", host=~\"$host\"} |~ \"(?i)error|failed\" | json | line_format \"{{.MESSAGE}}\" | keep host",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"showTime": true,
|
||||||
|
"showLabels": true,
|
||||||
|
"showCommonLabels": false,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": false,
|
||||||
|
"enableLogDetails": true,
|
||||||
|
"sortOrder": "Descending"
|
||||||
|
},
|
||||||
|
"description": "Errors and failures during NixOS upgrades"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
208
services/grafana/dashboards/node-exporter.json
Normal file
208
services/grafana/dashboards/node-exporter.json
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
{
|
||||||
|
"uid": "node-exporter-homelab",
|
||||||
|
"title": "Node Exporter - Homelab",
|
||||||
|
"tags": ["node-exporter", "prometheus", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "instance",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"query": "label_values(node_uname_info, instance)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": false,
|
||||||
|
"multi": false,
|
||||||
|
"current": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "CPU Usage",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"$instance\"}[5m])) * 100)",
|
||||||
|
"legendFormat": "CPU %",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 70},
|
||||||
|
{"color": "red", "value": 90}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Memory Usage",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$instance\"} / node_memory_MemTotal_bytes{instance=~\"$instance\"})) * 100",
|
||||||
|
"legendFormat": "Memory %",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 70},
|
||||||
|
{"color": "red", "value": 90}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Disk Usage",
|
||||||
|
"type": "gauge",
|
||||||
|
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 8},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 - ((node_filesystem_avail_bytes{instance=~\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{instance=~\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"}) * 100)",
|
||||||
|
"legendFormat": "Root /",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 70},
|
||||||
|
{"color": "red", "value": 85}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "System Load",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 8},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "node_load1{instance=~\"$instance\"}",
|
||||||
|
"legendFormat": "1m",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "node_load5{instance=~\"$instance\"}",
|
||||||
|
"legendFormat": "5m",
|
||||||
|
"refId": "B"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "node_load15{instance=~\"$instance\"}",
|
||||||
|
"legendFormat": "15m",
|
||||||
|
"refId": "C"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Uptime",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 8},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "time() - node_boot_time_seconds{instance=~\"$instance\"}",
|
||||||
|
"legendFormat": "Uptime",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Network Traffic",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(node_network_receive_bytes_total{instance=~\"$instance\",device!~\"lo|veth.*|br.*|docker.*\"}[5m])",
|
||||||
|
"legendFormat": "Receive {{device}}",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "-rate(node_network_transmit_bytes_total{instance=~\"$instance\",device!~\"lo|veth.*|br.*|docker.*\"}[5m])",
|
||||||
|
"legendFormat": "Transmit {{device}}",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Disk I/O",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(node_disk_read_bytes_total{instance=~\"$instance\",device!~\"dm-.*\"}[5m])",
|
||||||
|
"legendFormat": "Read {{device}}",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "-rate(node_disk_written_bytes_total{instance=~\"$instance\",device!~\"dm-.*\"}[5m])",
|
||||||
|
"legendFormat": "Write {{device}}",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
606
services/grafana/dashboards/proxmox.json
Normal file
606
services/grafana/dashboards/proxmox.json
Normal file
@@ -0,0 +1,606 @@
|
|||||||
|
{
|
||||||
|
"uid": "proxmox-homelab",
|
||||||
|
"title": "Proxmox - Homelab",
|
||||||
|
"tags": ["proxmox", "virtualization", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": {
|
||||||
|
"from": "now-6h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "vm",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"query": "label_values(pve_guest_info{template=\"0\"}, name)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"current": {"text": "All", "value": "$__all"}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "VMs Running",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(pve_up{id=~\"qemu/.*\"} * on(id) pve_guest_info{template=\"0\"} == 1)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "green", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "VMs Stopped",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(pve_up{id=~\"qemu/.*\"} * on(id) pve_guest_info{template=\"0\"} == 0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 1},
|
||||||
|
{"color": "red", "value": 3}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Node CPU",
|
||||||
|
"type": "gauge",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_cpu_usage_ratio{id=~\"node/.*\"} * 100",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 70},
|
||||||
|
{"color": "red", "value": 90}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Node Memory",
|
||||||
|
"type": "gauge",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_memory_usage_bytes{id=~\"node/.*\"} / pve_memory_size_bytes{id=~\"node/.*\"} * 100",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 70},
|
||||||
|
{"color": "red", "value": 90}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Node Uptime",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_uptime_seconds{id=~\"node/.*\"}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "blue", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Templates",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(pve_guest_info{template=\"1\"})",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "purple", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "VM Status",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "info"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "pve_up{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "pve_cpu_usage_ratio{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"} * 100",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "cpu"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "pve_memory_usage_bytes{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"} / on(id) pve_memory_size_bytes * 100",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "mem"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "pve_uptime_seconds{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "uptime"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Name"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 150}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Status"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "custom.width", "value": 80},
|
||||||
|
{"id": "mappings", "value": [
|
||||||
|
{"type": "value", "options": {"0": {"text": "Stopped", "color": "red"}}},
|
||||||
|
{"type": "value", "options": {"1": {"text": "Running", "color": "green"}}}
|
||||||
|
]},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "CPU %"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "percent"},
|
||||||
|
{"id": "decimals", "value": 1},
|
||||||
|
{"id": "custom.width", "value": 80},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "gauge", "mode": "basic"}},
|
||||||
|
{"id": "min", "value": 0},
|
||||||
|
{"id": "max", "value": 100},
|
||||||
|
{"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 50}, {"color": "red", "value": 80}]}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Memory %"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "percent"},
|
||||||
|
{"id": "decimals", "value": 1},
|
||||||
|
{"id": "custom.width", "value": 100},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "gauge", "mode": "basic"}},
|
||||||
|
{"id": "min", "value": 0},
|
||||||
|
{"id": "max", "value": 100},
|
||||||
|
{"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 90}]}}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Uptime"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "s"},
|
||||||
|
{"id": "custom.width", "value": 100}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "ID"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 90}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Name", "desc": false}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {"byField": "name", "mode": "outer"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Time 1": true,
|
||||||
|
"Time 2": true,
|
||||||
|
"Time 3": true,
|
||||||
|
"Time 4": true,
|
||||||
|
"Value #info": true,
|
||||||
|
"__name__": true,
|
||||||
|
"id 1": true,
|
||||||
|
"id 2": true,
|
||||||
|
"id 3": true,
|
||||||
|
"id 4": true,
|
||||||
|
"instance": true,
|
||||||
|
"instance 1": true,
|
||||||
|
"instance 2": true,
|
||||||
|
"instance 3": true,
|
||||||
|
"instance 4": true,
|
||||||
|
"job": true,
|
||||||
|
"job 1": true,
|
||||||
|
"job 2": true,
|
||||||
|
"job 3": true,
|
||||||
|
"job 4": true,
|
||||||
|
"name 1": true,
|
||||||
|
"name 2": true,
|
||||||
|
"name 3": true,
|
||||||
|
"name 4": true,
|
||||||
|
"node": true,
|
||||||
|
"tags": true,
|
||||||
|
"template": true,
|
||||||
|
"type": true
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"name": 0,
|
||||||
|
"id": 1,
|
||||||
|
"Value #status": 2,
|
||||||
|
"Value #cpu": 3,
|
||||||
|
"Value #mem": 4,
|
||||||
|
"Value #uptime": 5
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"name": "Name",
|
||||||
|
"id": "ID",
|
||||||
|
"Value #status": "Status",
|
||||||
|
"Value #cpu": "CPU %",
|
||||||
|
"Value #mem": "Memory %",
|
||||||
|
"Value #uptime": "Uptime"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "VM CPU Usage",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_cpu_usage_ratio{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"} * 100",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "VM Memory Usage",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_memory_usage_bytes{id=~\"qemu/.*\"} * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes",
|
||||||
|
"min": 0,
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"title": "VM Network Traffic",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 22},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(pve_network_receive_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"legendFormat": "{{name}} RX",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "-rate(pve_network_transmit_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"legendFormat": "{{name}} TX",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"title": "VM Disk I/O",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 22},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(pve_disk_read_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"legendFormat": "{{name}} Read",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "-rate(pve_disk_write_bytes{id=~\"qemu/.*\"}[5m]) * on(id) group_left(name) pve_guest_info{template=\"0\", name=~\"$vm\"}",
|
||||||
|
"legendFormat": "{{name}} Write",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "Bps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"showPoints": "never"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"title": "Storage Usage",
|
||||||
|
"type": "bargauge",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 30},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_disk_usage_bytes{id=~\"storage/.*\"} / pve_disk_size_bytes{id=~\"storage/.*\"} * 100",
|
||||||
|
"legendFormat": "{{id}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 70},
|
||||||
|
{"color": "red", "value": 85}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "storage/pve1/(.*)",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"title": "Storage Capacity",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 12, "y": 30},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "pve_disk_size_bytes{id=~\"storage/.*\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "size"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "pve_disk_usage_bytes{id=~\"storage/.*\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "used"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "pve_disk_size_bytes{id=~\"storage/.*\"} - pve_disk_usage_bytes{id=~\"storage/.*\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "free"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "bytes"
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Storage"},
|
||||||
|
"properties": [{"id": "unit", "value": "none"}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {"byField": "id", "mode": "outer"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Time 1": true,
|
||||||
|
"Time 2": true,
|
||||||
|
"instance": true,
|
||||||
|
"instance 1": true,
|
||||||
|
"instance 2": true,
|
||||||
|
"job": true,
|
||||||
|
"job 1": true,
|
||||||
|
"job 2": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"id": "Storage",
|
||||||
|
"Value #size": "Total",
|
||||||
|
"Value #used": "Used",
|
||||||
|
"Value #free": "Free"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "storage/pve1/(.*)",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
553
services/grafana/dashboards/systemd.json
Normal file
553
services/grafana/dashboards/systemd.json
Normal file
@@ -0,0 +1,553 @@
|
|||||||
|
{
|
||||||
|
"uid": "systemd-homelab",
|
||||||
|
"title": "Systemd Services - Homelab",
|
||||||
|
"tags": ["systemd", "services", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "1m",
|
||||||
|
"time": {
|
||||||
|
"from": "now-24h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "hostname",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"query": "label_values(systemd_unit_state, hostname)",
|
||||||
|
"refresh": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"current": {"text": "All", "value": "$__all"}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Failed Units",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(systemd_unit_state{state=\"failed\", hostname=~\"$hostname\"} == 1) or vector(0)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "red", "value": 1}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Active Units",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(systemd_unit_state{state=\"active\", hostname=~\"$hostname\"} == 1)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "green", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Hosts Monitored",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(count by (hostname) (systemd_unit_state{hostname=~\"$hostname\"}))",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "blue", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Total Service Restarts",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(systemd_service_restart_total{hostname=~\"$hostname\"})",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": null},
|
||||||
|
{"color": "yellow", "value": 10},
|
||||||
|
{"color": "orange", "value": 50}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"noValue": "0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Inactive Units",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(systemd_unit_state{state=\"inactive\", hostname=~\"$hostname\"} == 1)",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "purple", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Timers",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "count(systemd_timer_last_trigger_seconds{hostname=~\"$hostname\"})",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "blue", "value": null}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Failed Units",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "systemd_unit_state{state=\"failed\", hostname=~\"$hostname\"} == 1",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Host"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 120}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Unit"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 300}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Host", "desc": false}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Value": true,
|
||||||
|
"__name__": true,
|
||||||
|
"dns_role": true,
|
||||||
|
"instance": true,
|
||||||
|
"job": true,
|
||||||
|
"role": true,
|
||||||
|
"state": true,
|
||||||
|
"tier": true,
|
||||||
|
"type": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"hostname": "Host",
|
||||||
|
"name": "Unit"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Units currently in failed state"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "Service Restarts (Top 15)",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 12, "y": 4},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "topk(15, systemd_service_restart_total{hostname=~\"$hostname\"} > 0)",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Host"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 120}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Service"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 280}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Restarts"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 80}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Restarts", "desc": true}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"__name__": true,
|
||||||
|
"dns_role": true,
|
||||||
|
"instance": true,
|
||||||
|
"job": true,
|
||||||
|
"role": true,
|
||||||
|
"tier": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"hostname": "Host",
|
||||||
|
"name": "Service",
|
||||||
|
"Value": "Restarts"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Services that have been restarted (since host boot)"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "Active Units per Host",
|
||||||
|
"type": "bargauge",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 10},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sort_desc(count by (hostname) (systemd_unit_state{state=\"active\", hostname=~\"$hostname\"} == 1))",
|
||||||
|
"legendFormat": "{{hostname}}",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [{"color": "green", "value": null}]
|
||||||
|
},
|
||||||
|
"min": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"displayMode": "gradient",
|
||||||
|
"showUnfilled": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"title": "NixOS Upgrade Timers",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 10},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "systemd_timer_last_trigger_seconds{name=\"nixos-upgrade.timer\", hostname=~\"$hostname\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "last"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "time() - systemd_timer_last_trigger_seconds{name=\"nixos-upgrade.timer\", hostname=~\"$hostname\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "ago"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Host"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 130}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Last Trigger"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "dateTimeAsLocalNoDateIfToday"},
|
||||||
|
{"id": "custom.width", "value": 180}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Time Ago"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "s"},
|
||||||
|
{"id": "custom.width", "value": 120},
|
||||||
|
{"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Time Ago", "desc": true}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {"byField": "hostname", "mode": "outer"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Time 1": true,
|
||||||
|
"__name__": true,
|
||||||
|
"__name__ 1": true,
|
||||||
|
"dns_role": true,
|
||||||
|
"dns_role 1": true,
|
||||||
|
"instance": true,
|
||||||
|
"instance 1": true,
|
||||||
|
"job": true,
|
||||||
|
"job 1": true,
|
||||||
|
"name": true,
|
||||||
|
"name 1": true,
|
||||||
|
"role": true,
|
||||||
|
"role 1": true,
|
||||||
|
"tier": true,
|
||||||
|
"tier 1": true
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"hostname": 0,
|
||||||
|
"Value #last": 1,
|
||||||
|
"Value #ago": 2
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"hostname": "Host",
|
||||||
|
"Value #last": "Last Trigger",
|
||||||
|
"Value #ago": "Time Ago"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "When nixos-upgrade.timer last ran on each host. Yellow >24h, Red >48h."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"title": "Backup Timers",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 18},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "systemd_timer_last_trigger_seconds{name=~\"restic.*\", hostname=~\"$hostname\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "last"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "time() - systemd_timer_last_trigger_seconds{name=~\"restic.*\", hostname=~\"$hostname\"}",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"refId": "ago"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Host"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 120}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Timer"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 220}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Last Trigger"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "dateTimeAsLocalNoDateIfToday"},
|
||||||
|
{"id": "custom.width", "value": 180}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Time Ago"},
|
||||||
|
"properties": [
|
||||||
|
{"id": "unit", "value": "s"},
|
||||||
|
{"id": "custom.width", "value": 100},
|
||||||
|
{"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 86400}, {"color": "red", "value": 172800}]}},
|
||||||
|
{"id": "custom.cellOptions", "value": {"type": "color-text"}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Time Ago", "desc": true}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {"byField": "name", "mode": "outer"}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"Time 1": true,
|
||||||
|
"__name__": true,
|
||||||
|
"__name__ 1": true,
|
||||||
|
"dns_role": true,
|
||||||
|
"dns_role 1": true,
|
||||||
|
"instance": true,
|
||||||
|
"instance 1": true,
|
||||||
|
"job": true,
|
||||||
|
"job 1": true,
|
||||||
|
"role": true,
|
||||||
|
"role 1": true,
|
||||||
|
"tier": true,
|
||||||
|
"tier 1": true,
|
||||||
|
"hostname 1": true
|
||||||
|
},
|
||||||
|
"indexByName": {
|
||||||
|
"hostname": 0,
|
||||||
|
"name": 1,
|
||||||
|
"Value #last": 2,
|
||||||
|
"Value #ago": 3
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"hostname": "Host",
|
||||||
|
"name": "Timer",
|
||||||
|
"Value #last": "Last Trigger",
|
||||||
|
"Value #ago": "Time Ago"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Restic backup timers"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"title": "Service Restarts Over Time",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 12, "y": 18},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (hostname) (increase(systemd_service_restart_total{hostname=~\"$hostname\"}[1h]))",
|
||||||
|
"legendFormat": "{{hostname}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 20,
|
||||||
|
"showPoints": "never",
|
||||||
|
"stacking": {"mode": "normal"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {"displayMode": "list", "placement": "bottom"},
|
||||||
|
"tooltip": {"mode": "multi", "sort": "desc"}
|
||||||
|
},
|
||||||
|
"description": "Service restart rate per hour"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
399
services/grafana/dashboards/temperature.json
Normal file
399
services/grafana/dashboards/temperature.json
Normal file
@@ -0,0 +1,399 @@
|
|||||||
|
{
|
||||||
|
"uid": "temperature-homelab",
|
||||||
|
"title": "Temperature - Homelab",
|
||||||
|
"tags": ["home-assistant", "temperature", "homelab"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "1m",
|
||||||
|
"time": {
|
||||||
|
"from": "now-30d",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"templating": {
|
||||||
|
"list": []
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Current Temperatures",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "celsius",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "blue", "value": null},
|
||||||
|
{"color": "green", "value": 18},
|
||||||
|
{"color": "yellow", "value": 24},
|
||||||
|
{"color": "orange", "value": 27},
|
||||||
|
{"color": "red", "value": 30}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"mappings": []
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": ["lastNotNull"],
|
||||||
|
"fields": "",
|
||||||
|
"values": false
|
||||||
|
},
|
||||||
|
"orientation": "auto",
|
||||||
|
"textMode": "auto",
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "area",
|
||||||
|
"justifyMode": "auto"
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "Temp (.*) Temperature",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Average Home Temperature",
|
||||||
|
"type": "gauge",
|
||||||
|
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "avg(hass_sensor_temperature_celsius{entity!~\".*device_temperature|.*server.*\"})",
|
||||||
|
"legendFormat": "Average",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "celsius",
|
||||||
|
"min": 15,
|
||||||
|
"max": 30,
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "blue", "value": null},
|
||||||
|
{"color": "green", "value": 18},
|
||||||
|
{"color": "yellow", "value": 24},
|
||||||
|
{"color": "red", "value": 28}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": ["lastNotNull"]
|
||||||
|
},
|
||||||
|
"showThresholdLabels": false,
|
||||||
|
"showThresholdMarkers": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Current Humidity",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 0},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "hass_sensor_humidity_percent{entity!~\".*server.*\"}",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": null},
|
||||||
|
{"color": "yellow", "value": 30},
|
||||||
|
{"color": "green", "value": 40},
|
||||||
|
{"color": "yellow", "value": 60},
|
||||||
|
{"color": "red", "value": 70}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": ["lastNotNull"]
|
||||||
|
},
|
||||||
|
"orientation": "horizontal",
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "Temp (.*) Humidity",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Temperature History (30 Days)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 6},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "celsius",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"pointSize": 5,
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": 3600000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "min", "max"]
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "Temp (.*) Temperature",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "temp_server Temperature",
|
||||||
|
"renamePattern": "Server"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Temperature Trend (1h rate of change)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "deriv(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[1h]) * 3600",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "celsius",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 20,
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": 3600000
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "blue", "value": null},
|
||||||
|
{"color": "green", "value": -0.5},
|
||||||
|
{"color": "green", "value": 0.5},
|
||||||
|
{"color": "red", "value": 1}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"displayName": "${__field.labels.friendly_name}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "Temp (.*) Temperature",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "temp_server Temperature",
|
||||||
|
"renamePattern": "Server"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "Rate of temperature change per hour. Positive = warming, Negative = cooling."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "24h Min / Max / Avg",
|
||||||
|
"type": "table",
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "min_over_time(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[24h])",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "min",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "max_over_time(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[24h])",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "max",
|
||||||
|
"instant": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "avg_over_time(hass_sensor_temperature_celsius{entity!~\".*device_temperature\"}[24h])",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "avg",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "celsius",
|
||||||
|
"decimals": 1
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {"id": "byName", "options": "Room"},
|
||||||
|
"properties": [{"id": "custom.width", "value": 150}]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": [{"displayName": "Room", "desc": false}]
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "joinByField",
|
||||||
|
"options": {
|
||||||
|
"byField": "friendly_name",
|
||||||
|
"mode": "outer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {
|
||||||
|
"Time": true,
|
||||||
|
"domain": true,
|
||||||
|
"entity": true,
|
||||||
|
"hostname": true,
|
||||||
|
"instance": true,
|
||||||
|
"job": true
|
||||||
|
},
|
||||||
|
"renameByName": {
|
||||||
|
"friendly_name": "Room",
|
||||||
|
"Value #min": "Min (24h)",
|
||||||
|
"Value #max": "Max (24h)",
|
||||||
|
"Value #avg": "Avg (24h)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "Temp (.*) Temperature",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Humidity History (30 Days)",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
|
||||||
|
"datasource": {"type": "prometheus", "uid": "victoriametrics"},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "hass_sensor_humidity_percent",
|
||||||
|
"legendFormat": "{{friendly_name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"min": 0,
|
||||||
|
"max": 100,
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 1,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"showPoints": "never",
|
||||||
|
"spanNulls": 3600000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": ["mean", "min", "max"]
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "Temp (.*) Humidity",
|
||||||
|
"renamePattern": "$1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "renameByRegex",
|
||||||
|
"options": {
|
||||||
|
"regex": "temp_server Humidity",
|
||||||
|
"renamePattern": "Server"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
121
services/grafana/default.nix
Normal file
121
services/grafana/default.nix
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
{ config, pkgs, ... }:
|
||||||
|
{
|
||||||
|
services.grafana = {
|
||||||
|
enable = true;
|
||||||
|
settings = {
|
||||||
|
server = {
|
||||||
|
http_addr = "127.0.0.1";
|
||||||
|
http_port = 3000;
|
||||||
|
domain = "grafana-test.home.2rjus.net";
|
||||||
|
root_url = "https://grafana-test.home.2rjus.net/";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Disable anonymous access
|
||||||
|
"auth.anonymous".enabled = false;
|
||||||
|
|
||||||
|
# OIDC authentication via Kanidm
|
||||||
|
"auth.generic_oauth" = {
|
||||||
|
enabled = true;
|
||||||
|
name = "Kanidm";
|
||||||
|
client_id = "grafana";
|
||||||
|
client_secret = "$__file{/run/secrets/grafana-oauth2}";
|
||||||
|
auth_url = "https://auth.home.2rjus.net/ui/oauth2";
|
||||||
|
token_url = "https://auth.home.2rjus.net/oauth2/token";
|
||||||
|
api_url = "https://auth.home.2rjus.net/oauth2/openid/grafana/userinfo";
|
||||||
|
scopes = "openid profile email groups";
|
||||||
|
use_pkce = true; # Required by Kanidm, more secure
|
||||||
|
# Extract user attributes from userinfo response
|
||||||
|
email_attribute_path = "email";
|
||||||
|
login_attribute_path = "preferred_username";
|
||||||
|
name_attribute_path = "name";
|
||||||
|
# Map admins group to Admin role, everyone else to Editor (for Explore access)
|
||||||
|
role_attribute_path = "contains(groups[*], 'admins') && 'Admin' || 'Editor'";
|
||||||
|
allow_sign_up = true;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Declarative datasources
|
||||||
|
provision.datasources.settings = {
|
||||||
|
apiVersion = 1;
|
||||||
|
prune = true;
|
||||||
|
deleteDatasources = [
|
||||||
|
{ name = "Prometheus (monitoring01)"; orgId = 1; }
|
||||||
|
];
|
||||||
|
datasources = [
|
||||||
|
{
|
||||||
|
name = "VictoriaMetrics";
|
||||||
|
type = "prometheus";
|
||||||
|
url = "http://localhost:8428";
|
||||||
|
isDefault = true;
|
||||||
|
uid = "victoriametrics";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
name = "Loki";
|
||||||
|
type = "loki";
|
||||||
|
url = "http://localhost:3100";
|
||||||
|
uid = "loki";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Declarative dashboards
|
||||||
|
provision.dashboards.settings = {
|
||||||
|
apiVersion = 1;
|
||||||
|
providers = [
|
||||||
|
{
|
||||||
|
name = "homelab";
|
||||||
|
type = "file";
|
||||||
|
options.path = ./dashboards;
|
||||||
|
disableDeletion = true;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Vault secret for OAuth2 client secret
|
||||||
|
vault.secrets.grafana-oauth2 = {
|
||||||
|
secretPath = "services/grafana/oauth2-client-secret";
|
||||||
|
extractKey = "password";
|
||||||
|
services = [ "grafana" ];
|
||||||
|
owner = "grafana";
|
||||||
|
group = "grafana";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Local Caddy for TLS termination
|
||||||
|
services.caddy = {
|
||||||
|
enable = true;
|
||||||
|
package = pkgs.unstable.caddy;
|
||||||
|
globalConfig = ''
|
||||||
|
acme_ca https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory
|
||||||
|
metrics
|
||||||
|
'';
|
||||||
|
virtualHosts."grafana.home.2rjus.net".extraConfig = ''
|
||||||
|
log {
|
||||||
|
output file /var/log/caddy/grafana.log {
|
||||||
|
mode 644
|
||||||
|
}
|
||||||
|
}
|
||||||
|
reverse_proxy http://127.0.0.1:3000
|
||||||
|
'';
|
||||||
|
virtualHosts."grafana-test.home.2rjus.net".extraConfig = ''
|
||||||
|
log {
|
||||||
|
output file /var/log/caddy/grafana.log {
|
||||||
|
mode 644
|
||||||
|
}
|
||||||
|
}
|
||||||
|
reverse_proxy http://127.0.0.1:3000
|
||||||
|
'';
|
||||||
|
# Metrics endpoint on plain HTTP for Prometheus scraping
|
||||||
|
extraConfig = ''
|
||||||
|
http://${config.networking.hostName}.home.2rjus.net/metrics {
|
||||||
|
metrics
|
||||||
|
}
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
|
# Expose Caddy metrics for Prometheus
|
||||||
|
homelab.monitoring.scrapeTargets = [{
|
||||||
|
job_name = "caddy";
|
||||||
|
port = 80;
|
||||||
|
}];
|
||||||
|
}
|
||||||
@@ -78,15 +78,15 @@
|
|||||||
# Override battery calculation using voltage (mV): (voltage - 2100) / 9
|
# Override battery calculation using voltage (mV): (voltage - 2100) / 9
|
||||||
"0x54ef441000a547bd" = {
|
"0x54ef441000a547bd" = {
|
||||||
friendly_name = "0x54ef441000a547bd";
|
friendly_name = "0x54ef441000a547bd";
|
||||||
homeassistant.battery.value_template = "{{ (((value_json.voltage | float) - 2100) / 9) | round(0) | int | min(100) | max(0) }}";
|
homeassistant.battery.value_template = "{{ [[(((value_json.voltage | float) - 2100) / 9) | round(0) | int, 100] | min, 0] | max }}";
|
||||||
};
|
};
|
||||||
"0x54ef441000a54d3c" = {
|
"0x54ef441000a54d3c" = {
|
||||||
friendly_name = "0x54ef441000a54d3c";
|
friendly_name = "0x54ef441000a54d3c";
|
||||||
homeassistant.battery.value_template = "{{ (((value_json.voltage | float) - 2100) / 9) | round(0) | int | min(100) | max(0) }}";
|
homeassistant.battery.value_template = "{{ [[(((value_json.voltage | float) - 2100) / 9) | round(0) | int, 100] | min, 0] | max }}";
|
||||||
};
|
};
|
||||||
"0x54ef441000a564b6" = {
|
"0x54ef441000a564b6" = {
|
||||||
friendly_name = "temp_server";
|
friendly_name = "temp_server";
|
||||||
homeassistant.battery.value_template = "{{ (((value_json.voltage | float) - 2100) / 9) | round(0) | int | min(100) | max(0) }}";
|
homeassistant.battery.value_template = "{{ [[(((value_json.voltage | float) - 2100) / 9) | round(0) | int, 100] | min, 0] | max }}";
|
||||||
};
|
};
|
||||||
|
|
||||||
# Other sensors
|
# Other sensors
|
||||||
|
|||||||
@@ -54,53 +54,49 @@
|
|||||||
}
|
}
|
||||||
reverse_proxy http://ha1.home.2rjus.net:8080
|
reverse_proxy http://ha1.home.2rjus.net:8080
|
||||||
}
|
}
|
||||||
prometheus.home.2rjus.net {
|
|
||||||
log {
|
|
||||||
output file /var/log/caddy/prometheus.log {
|
|
||||||
mode 644
|
|
||||||
}
|
|
||||||
}
|
|
||||||
reverse_proxy http://monitoring01.home.2rjus.net:9090
|
|
||||||
}
|
|
||||||
alertmanager.home.2rjus.net {
|
|
||||||
log {
|
|
||||||
output file /var/log/caddy/alertmanager.log {
|
|
||||||
mode 644
|
|
||||||
}
|
|
||||||
}
|
|
||||||
reverse_proxy http://monitoring01.home.2rjus.net:9093
|
|
||||||
}
|
|
||||||
grafana.home.2rjus.net {
|
|
||||||
log {
|
|
||||||
output file /var/log/caddy/grafana.log {
|
|
||||||
mode 644
|
|
||||||
}
|
|
||||||
}
|
|
||||||
reverse_proxy http://monitoring01.home.2rjus.net:3000
|
|
||||||
}
|
|
||||||
jelly.home.2rjus.net {
|
jelly.home.2rjus.net {
|
||||||
log {
|
log {
|
||||||
output file /var/log/caddy/jelly.log {
|
output file /var/log/caddy/jelly.log {
|
||||||
mode 644
|
mode 644
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
reverse_proxy http://jelly01.home.2rjus.net:8096
|
header Content-Type text/html
|
||||||
}
|
respond <<HTML
|
||||||
pyroscope.home.2rjus.net {
|
<!DOCTYPE html>
|
||||||
log {
|
<html>
|
||||||
output file /var/log/caddy/pyroscope.log {
|
<head>
|
||||||
mode 644
|
<title>Jellyfin - Maintenance</title>
|
||||||
}
|
<style>
|
||||||
}
|
body {
|
||||||
reverse_proxy http://monitoring01.home.2rjus.net:4040
|
background: #101020;
|
||||||
}
|
color: #ddd;
|
||||||
pushgw.home.2rjus.net {
|
font-family: sans-serif;
|
||||||
log {
|
display: flex;
|
||||||
output file /var/log/caddy/pushgw.log {
|
justify-content: center;
|
||||||
mode 644
|
align-items: center;
|
||||||
}
|
min-height: 100vh;
|
||||||
}
|
margin: 0;
|
||||||
reverse_proxy http://monitoring01.home.2rjus.net:9091
|
text-align: center;
|
||||||
|
}
|
||||||
|
.container { max-width: 500px; }
|
||||||
|
.disk { font-size: 80px; animation: spin 3s linear infinite; display: inline-block; }
|
||||||
|
@keyframes spin { from { transform: rotate(0deg); } to { transform: rotate(360deg); } }
|
||||||
|
h1 { color: #00a4dc; }
|
||||||
|
p { font-size: 1.2em; line-height: 1.6; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="container">
|
||||||
|
<div class="disk">💿</div>
|
||||||
|
<h1>Jellyfin is taking a nap</h1>
|
||||||
|
<p>The NAS is getting shiny new hard drives.<br>
|
||||||
|
Jellyfin will be back once the disks stop spinning up.</p>
|
||||||
|
<p style="color:#666;font-size:0.9em;">In the meantime, maybe go outside?</p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
HTML 200
|
||||||
}
|
}
|
||||||
http://http-proxy.home.2rjus.net/metrics {
|
http://http-proxy.home.2rjus.net/metrics {
|
||||||
log {
|
log {
|
||||||
|
|||||||
@@ -17,20 +17,43 @@
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
# Provisioning - initial users/groups
|
# Provision base groups only - users are managed via CLI
|
||||||
|
# See docs/user-management.md for details
|
||||||
provision = {
|
provision = {
|
||||||
enable = true;
|
enable = true;
|
||||||
idmAdminPasswordFile = config.vault.secrets.kanidm-idm-admin.outputDir;
|
idmAdminPasswordFile = config.vault.secrets.kanidm-idm-admin.outputDir;
|
||||||
|
|
||||||
groups = {
|
groups = {
|
||||||
admins = { };
|
# overwriteMembers = false allows imperative member management via CLI
|
||||||
users = { };
|
admins = { overwriteMembers = false; };
|
||||||
ssh-users = { };
|
users = { overwriteMembers = false; };
|
||||||
|
ssh-users = { overwriteMembers = false; };
|
||||||
};
|
};
|
||||||
|
|
||||||
persons.torjus = {
|
# Regular users (persons) are managed imperatively via kanidm CLI
|
||||||
displayName = "Torjus";
|
|
||||||
groups = [ "admins" "users" "ssh-users" ];
|
# OAuth2/OIDC clients for service authentication
|
||||||
|
systems.oauth2.grafana = {
|
||||||
|
displayName = "Grafana";
|
||||||
|
originUrl = "https://grafana-test.home.2rjus.net/login/generic_oauth";
|
||||||
|
originLanding = "https://grafana-test.home.2rjus.net/";
|
||||||
|
basicSecretFile = config.vault.secrets.grafana-oauth2.outputDir;
|
||||||
|
preferShortUsername = true;
|
||||||
|
scopeMaps.users = [ "openid" "profile" "email" "groups" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
systems.oauth2.openbao = {
|
||||||
|
displayName = "OpenBao Secrets";
|
||||||
|
# Web UI callback only (CLI localhost not supported with confidential clients)
|
||||||
|
originUrl = "https://vault.home.2rjus.net:8200/ui/vault/auth/oidc/oidc/callback";
|
||||||
|
originLanding = "https://vault.home.2rjus.net:8200/";
|
||||||
|
basicSecretFile = config.vault.secrets.openbao-oauth2.outputDir;
|
||||||
|
preferShortUsername = true;
|
||||||
|
# Enable RS256 signing algorithm (required by OpenBao)
|
||||||
|
enableLegacyCrypto = true;
|
||||||
|
# Allow groups scope for role binding
|
||||||
|
scopeMaps.admins = [ "openid" "profile" "email" "groups" ];
|
||||||
|
scopeMaps.users = [ "openid" "profile" "email" "groups" ];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@@ -46,7 +69,7 @@
|
|||||||
extraDomainNames = [ "${config.networking.hostName}.home.2rjus.net" ];
|
extraDomainNames = [ "${config.networking.hostName}.home.2rjus.net" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
# Vault secret for idm_admin password
|
# Vault secret for idm_admin password (used for provisioning)
|
||||||
vault.secrets.kanidm-idm-admin = {
|
vault.secrets.kanidm-idm-admin = {
|
||||||
secretPath = "kanidm/idm-admin-password";
|
secretPath = "kanidm/idm-admin-password";
|
||||||
extractKey = "password";
|
extractKey = "password";
|
||||||
@@ -55,6 +78,24 @@
|
|||||||
group = "kanidm";
|
group = "kanidm";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Vault secret for Grafana OAuth2 client secret
|
||||||
|
vault.secrets.grafana-oauth2 = {
|
||||||
|
secretPath = "services/grafana/oauth2-client-secret";
|
||||||
|
extractKey = "password";
|
||||||
|
services = [ "kanidm" ];
|
||||||
|
owner = "kanidm";
|
||||||
|
group = "kanidm";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Vault secret for OpenBao OAuth2 client secret
|
||||||
|
vault.secrets.openbao-oauth2 = {
|
||||||
|
secretPath = "services/openbao/oauth2-client-secret";
|
||||||
|
extractKey = "password";
|
||||||
|
services = [ "kanidm" ];
|
||||||
|
owner = "kanidm";
|
||||||
|
group = "kanidm";
|
||||||
|
};
|
||||||
|
|
||||||
# Note: Kanidm does not expose Prometheus metrics
|
# Note: Kanidm does not expose Prometheus metrics
|
||||||
# If metrics support is added in the future, uncomment:
|
# If metrics support is added in the future, uncomment:
|
||||||
# homelab.monitoring.scrapeTargets = [
|
# homelab.monitoring.scrapeTargets = [
|
||||||
|
|||||||
104
services/loki/default.nix
Normal file
104
services/loki/default.nix
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
{ config, lib, pkgs, ... }:
|
||||||
|
let
|
||||||
|
# Script to generate bcrypt hash from Vault password for Caddy basic_auth
|
||||||
|
generateCaddyAuth = pkgs.writeShellApplication {
|
||||||
|
name = "generate-caddy-loki-auth";
|
||||||
|
runtimeInputs = [ config.services.caddy.package ];
|
||||||
|
text = ''
|
||||||
|
PASSWORD=$(cat /run/secrets/loki-push-auth)
|
||||||
|
HASH=$(caddy hash-password --plaintext "$PASSWORD")
|
||||||
|
echo "LOKI_PUSH_HASH=$HASH" > /run/secrets/caddy-loki-auth.env
|
||||||
|
chmod 0400 /run/secrets/caddy-loki-auth.env
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
in
|
||||||
|
{
|
||||||
|
# Fetch Loki push password from Vault
|
||||||
|
vault.secrets.loki-push-auth = {
|
||||||
|
secretPath = "shared/loki/push-auth";
|
||||||
|
extractKey = "password";
|
||||||
|
services = [ "caddy" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Generate bcrypt hash for Caddy before it starts
|
||||||
|
systemd.services.caddy-loki-auth = {
|
||||||
|
description = "Generate Caddy basic auth hash for Loki";
|
||||||
|
after = [ "vault-secret-loki-push-auth.service" ];
|
||||||
|
requires = [ "vault-secret-loki-push-auth.service" ];
|
||||||
|
before = [ "caddy.service" ];
|
||||||
|
requiredBy = [ "caddy.service" ];
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
RemainAfterExit = true;
|
||||||
|
ExecStart = lib.getExe generateCaddyAuth;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Load the bcrypt hash as environment variable for Caddy
|
||||||
|
services.caddy.environmentFile = "/run/secrets/caddy-loki-auth.env";
|
||||||
|
|
||||||
|
# Caddy reverse proxy for Loki with basic auth
|
||||||
|
services.caddy.virtualHosts."loki.home.2rjus.net".extraConfig = ''
|
||||||
|
basic_auth {
|
||||||
|
promtail {env.LOKI_PUSH_HASH}
|
||||||
|
}
|
||||||
|
reverse_proxy http://127.0.0.1:3100
|
||||||
|
'';
|
||||||
|
|
||||||
|
services.loki = {
|
||||||
|
enable = true;
|
||||||
|
configuration = {
|
||||||
|
auth_enabled = false;
|
||||||
|
|
||||||
|
server = {
|
||||||
|
http_listen_address = "127.0.0.1";
|
||||||
|
http_listen_port = 3100;
|
||||||
|
};
|
||||||
|
common = {
|
||||||
|
ring = {
|
||||||
|
instance_addr = "127.0.0.1";
|
||||||
|
kvstore = {
|
||||||
|
store = "inmemory";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
replication_factor = 1;
|
||||||
|
path_prefix = "/var/lib/loki";
|
||||||
|
};
|
||||||
|
schema_config = {
|
||||||
|
configs = [
|
||||||
|
{
|
||||||
|
from = "2024-01-01";
|
||||||
|
store = "tsdb";
|
||||||
|
object_store = "filesystem";
|
||||||
|
schema = "v13";
|
||||||
|
index = {
|
||||||
|
prefix = "loki_index_";
|
||||||
|
period = "24h";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
storage_config = {
|
||||||
|
filesystem = {
|
||||||
|
directory = "/var/lib/loki/chunks";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
compactor = {
|
||||||
|
working_directory = "/var/lib/loki/compactor";
|
||||||
|
compaction_interval = "10m";
|
||||||
|
retention_enabled = true;
|
||||||
|
retention_delete_delay = "2h";
|
||||||
|
retention_delete_worker_count = 150;
|
||||||
|
delete_request_store = "filesystem";
|
||||||
|
};
|
||||||
|
limits_config = {
|
||||||
|
retention_period = "30d";
|
||||||
|
ingestion_rate_mb = 10;
|
||||||
|
ingestion_burst_size_mb = 20;
|
||||||
|
max_streams_per_user = 10000;
|
||||||
|
max_query_series = 500;
|
||||||
|
max_query_parallelism = 8;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
31
services/monitoring/blackbox.nix
Normal file
31
services/monitoring/blackbox.nix
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
{ pkgs, ... }:
|
||||||
|
{
|
||||||
|
services.prometheus.exporters.blackbox = {
|
||||||
|
enable = true;
|
||||||
|
configFile = pkgs.writeText "blackbox.yml" ''
|
||||||
|
modules:
|
||||||
|
https_cert:
|
||||||
|
prober: http
|
||||||
|
timeout: 10s
|
||||||
|
http:
|
||||||
|
fail_if_not_ssl: true
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
valid_status_codes:
|
||||||
|
- 200
|
||||||
|
- 204
|
||||||
|
- 301
|
||||||
|
- 302
|
||||||
|
- 303
|
||||||
|
- 307
|
||||||
|
- 308
|
||||||
|
- 400
|
||||||
|
- 401
|
||||||
|
- 403
|
||||||
|
- 404
|
||||||
|
- 405
|
||||||
|
- 500
|
||||||
|
- 502
|
||||||
|
- 503
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./loki.nix
|
|
||||||
./grafana.nix
|
|
||||||
./prometheus.nix
|
|
||||||
./pve.nix
|
|
||||||
./alerttonotify.nix
|
|
||||||
./pyroscope.nix
|
|
||||||
./tempo.nix
|
|
||||||
];
|
|
||||||
}
|
|
||||||
17
services/monitoring/exportarr.nix
Normal file
17
services/monitoring/exportarr.nix
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
{ config, ... }:
|
||||||
|
{
|
||||||
|
# Vault secret for API key
|
||||||
|
vault.secrets.sonarr-api-key = {
|
||||||
|
secretPath = "services/exportarr/sonarr";
|
||||||
|
extractKey = "api_key";
|
||||||
|
services = [ "prometheus-exportarr-sonarr-exporter" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Sonarr exporter
|
||||||
|
services.prometheus.exporters.exportarr-sonarr = {
|
||||||
|
enable = true;
|
||||||
|
url = "http://sonarr-jail.home.2rjus.net:8989";
|
||||||
|
apiKeyFile = config.vault.secrets.sonarr-api-key.outputDir;
|
||||||
|
port = 9709;
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
{
|
|
||||||
services.grafana = {
|
|
||||||
enable = true;
|
|
||||||
settings = {
|
|
||||||
server = {
|
|
||||||
http_addr = "";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
services.loki = {
|
|
||||||
enable = true;
|
|
||||||
configuration = {
|
|
||||||
auth_enabled = false;
|
|
||||||
|
|
||||||
server = {
|
|
||||||
http_listen_port = 3100;
|
|
||||||
};
|
|
||||||
common = {
|
|
||||||
ring = {
|
|
||||||
instance_addr = "127.0.0.1";
|
|
||||||
kvstore = {
|
|
||||||
store = "inmemory";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
replication_factor = 1;
|
|
||||||
path_prefix = "/var/lib/loki";
|
|
||||||
};
|
|
||||||
schema_config = {
|
|
||||||
configs = [
|
|
||||||
{
|
|
||||||
from = "2024-01-01";
|
|
||||||
store = "tsdb";
|
|
||||||
object_store = "filesystem";
|
|
||||||
schema = "v13";
|
|
||||||
index = {
|
|
||||||
prefix = "loki_index_";
|
|
||||||
period = "24h";
|
|
||||||
};
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
storage_config = {
|
|
||||||
filesystem = {
|
|
||||||
directory = "/var/lib/loki/chunks";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,247 +0,0 @@
|
|||||||
{ self, lib, pkgs, ... }:
|
|
||||||
let
|
|
||||||
monLib = import ../../lib/monitoring.nix { inherit lib; };
|
|
||||||
externalTargets = import ./external-targets.nix;
|
|
||||||
|
|
||||||
nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
|
|
||||||
autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
|
|
||||||
|
|
||||||
# Script to fetch AppRole token for Prometheus to use when scraping OpenBao metrics
|
|
||||||
fetchOpenbaoToken = pkgs.writeShellApplication {
|
|
||||||
name = "fetch-openbao-token";
|
|
||||||
runtimeInputs = [ pkgs.curl pkgs.jq ];
|
|
||||||
text = ''
|
|
||||||
VAULT_ADDR="https://vault01.home.2rjus.net:8200"
|
|
||||||
APPROLE_DIR="/var/lib/vault/approle"
|
|
||||||
OUTPUT_FILE="/run/secrets/prometheus/openbao-token"
|
|
||||||
|
|
||||||
# Read AppRole credentials
|
|
||||||
if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
|
|
||||||
echo "AppRole credentials not found at $APPROLE_DIR" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
ROLE_ID=$(cat "$APPROLE_DIR/role-id")
|
|
||||||
SECRET_ID=$(cat "$APPROLE_DIR/secret-id")
|
|
||||||
|
|
||||||
# Authenticate to Vault
|
|
||||||
AUTH_RESPONSE=$(curl -sf -k -X POST \
|
|
||||||
-d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \
|
|
||||||
"$VAULT_ADDR/v1/auth/approle/login")
|
|
||||||
|
|
||||||
# Extract token
|
|
||||||
VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token')
|
|
||||||
if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
|
|
||||||
echo "Failed to extract Vault token from response" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Write token to file
|
|
||||||
mkdir -p "$(dirname "$OUTPUT_FILE")"
|
|
||||||
echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE"
|
|
||||||
chown prometheus:prometheus "$OUTPUT_FILE"
|
|
||||||
chmod 0400 "$OUTPUT_FILE"
|
|
||||||
|
|
||||||
echo "Successfully fetched OpenBao token"
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
in
|
|
||||||
{
|
|
||||||
# Systemd service to fetch AppRole token for Prometheus OpenBao scraping
|
|
||||||
# The token is used to authenticate when scraping /v1/sys/metrics
|
|
||||||
systemd.services.prometheus-openbao-token = {
|
|
||||||
description = "Fetch OpenBao token for Prometheus metrics scraping";
|
|
||||||
after = [ "network-online.target" ];
|
|
||||||
wants = [ "network-online.target" ];
|
|
||||||
before = [ "prometheus.service" ];
|
|
||||||
requiredBy = [ "prometheus.service" ];
|
|
||||||
|
|
||||||
serviceConfig = {
|
|
||||||
Type = "oneshot";
|
|
||||||
ExecStart = lib.getExe fetchOpenbaoToken;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
# Timer to periodically refresh the token (AppRole tokens have 1-hour TTL)
|
|
||||||
systemd.timers.prometheus-openbao-token = {
|
|
||||||
description = "Refresh OpenBao token for Prometheus";
|
|
||||||
wantedBy = [ "timers.target" ];
|
|
||||||
timerConfig = {
|
|
||||||
OnBootSec = "5min";
|
|
||||||
OnUnitActiveSec = "30min";
|
|
||||||
RandomizedDelaySec = "5min";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
services.prometheus = {
|
|
||||||
enable = true;
|
|
||||||
# syntax-only check because we use external credential files (e.g., openbao-token)
|
|
||||||
checkConfig = "syntax-only";
|
|
||||||
alertmanager = {
|
|
||||||
enable = true;
|
|
||||||
configuration = {
|
|
||||||
global = {
|
|
||||||
};
|
|
||||||
route = {
|
|
||||||
receiver = "webhook_natstonotify";
|
|
||||||
group_wait = "30s";
|
|
||||||
group_interval = "5m";
|
|
||||||
repeat_interval = "1h";
|
|
||||||
group_by = [ "alertname" ];
|
|
||||||
};
|
|
||||||
receivers = [
|
|
||||||
{
|
|
||||||
name = "webhook_natstonotify";
|
|
||||||
webhook_configs = [
|
|
||||||
{
|
|
||||||
url = "http://localhost:5001/alert";
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
alertmanagers = [
|
|
||||||
{
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "localhost:9093" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
|
|
||||||
retentionTime = "30d";
|
|
||||||
globalConfig = {
|
|
||||||
scrape_interval = "15s";
|
|
||||||
};
|
|
||||||
rules = [
|
|
||||||
(builtins.readFile ./rules.yml)
|
|
||||||
];
|
|
||||||
|
|
||||||
scrapeConfigs = [
|
|
||||||
# Auto-generated node-exporter targets from flake hosts + external
|
|
||||||
# Each static_config entry may have labels from homelab.host metadata
|
|
||||||
{
|
|
||||||
job_name = "node-exporter";
|
|
||||||
static_configs = nodeExporterTargets;
|
|
||||||
}
|
|
||||||
# Systemd exporter on all hosts (same targets, different port)
|
|
||||||
# Preserves the same label grouping as node-exporter
|
|
||||||
{
|
|
||||||
job_name = "systemd-exporter";
|
|
||||||
static_configs = map
|
|
||||||
(cfg: cfg // {
|
|
||||||
targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets;
|
|
||||||
})
|
|
||||||
nodeExporterTargets;
|
|
||||||
}
|
|
||||||
# Local monitoring services (not auto-generated)
|
|
||||||
{
|
|
||||||
job_name = "prometheus";
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "localhost:9090" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
{
|
|
||||||
job_name = "loki";
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "localhost:3100" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
{
|
|
||||||
job_name = "grafana";
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "localhost:3000" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
{
|
|
||||||
job_name = "alertmanager";
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "localhost:9093" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
{
|
|
||||||
job_name = "pushgateway";
|
|
||||||
honor_labels = true;
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "localhost:9091" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
# TODO: nix-cache_caddy can't be auto-generated because the cert is issued
|
|
||||||
# for nix-cache.home.2rjus.net (service CNAME), not nix-cache01 (hostname).
|
|
||||||
# Consider adding a target override to homelab.monitoring.scrapeTargets.
|
|
||||||
{
|
|
||||||
job_name = "nix-cache_caddy";
|
|
||||||
scheme = "https";
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "nix-cache.home.2rjus.net" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
# pve-exporter with complex relabel config
|
|
||||||
{
|
|
||||||
job_name = "pve-exporter";
|
|
||||||
static_configs = [
|
|
||||||
{
|
|
||||||
targets = [ "10.69.12.75" ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
metrics_path = "/pve";
|
|
||||||
params = {
|
|
||||||
module = [ "default" ];
|
|
||||||
cluster = [ "1" ];
|
|
||||||
node = [ "1" ];
|
|
||||||
};
|
|
||||||
relabel_configs = [
|
|
||||||
{
|
|
||||||
source_labels = [ "__address__" ];
|
|
||||||
target_label = "__param_target";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
source_labels = [ "__param_target" ];
|
|
||||||
target_label = "instance";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
target_label = "__address__";
|
|
||||||
replacement = "127.0.0.1:9221";
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
# OpenBao metrics with bearer token auth
|
|
||||||
{
|
|
||||||
job_name = "openbao";
|
|
||||||
scheme = "https";
|
|
||||||
metrics_path = "/v1/sys/metrics";
|
|
||||||
params = {
|
|
||||||
format = [ "prometheus" ];
|
|
||||||
};
|
|
||||||
static_configs = [{
|
|
||||||
targets = [ "vault01.home.2rjus.net:8200" ];
|
|
||||||
}];
|
|
||||||
authorization = {
|
|
||||||
type = "Bearer";
|
|
||||||
credentials_file = "/run/secrets/prometheus/openbao-token";
|
|
||||||
};
|
|
||||||
}
|
|
||||||
] ++ autoScrapeConfigs;
|
|
||||||
|
|
||||||
pushgateway = {
|
|
||||||
enable = true;
|
|
||||||
web = {
|
|
||||||
external-url = "https://pushgw.home.2rjus.net";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
{ config, ... }:
|
{ config, ... }:
|
||||||
{
|
{
|
||||||
vault.secrets.pve-exporter = {
|
vault.secrets.pve-exporter = {
|
||||||
secretPath = "hosts/monitoring01/pve-exporter";
|
secretPath = "hosts/monitoring02/pve-exporter";
|
||||||
extractKey = "config";
|
extractKey = "config";
|
||||||
outputDir = "/run/secrets/pve_exporter";
|
outputDir = "/run/secrets/pve_exporter";
|
||||||
mode = "0444";
|
mode = "0444";
|
||||||
|
|||||||
@@ -1,8 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
virtualisation.oci-containers.containers.pyroscope = {
|
|
||||||
pull = "missing";
|
|
||||||
image = "grafana/pyroscope:latest";
|
|
||||||
ports = [ "4040:4040" ];
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -67,13 +67,13 @@ groups:
|
|||||||
summary: "Promtail service not running on {{ $labels.instance }}"
|
summary: "Promtail service not running on {{ $labels.instance }}"
|
||||||
description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
|
description: "The promtail service has not been active on {{ $labels.instance }} for 5 minutes."
|
||||||
- alert: filesystem_filling_up
|
- alert: filesystem_filling_up
|
||||||
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[6h], 24*3600) < 0
|
expr: predict_linear(node_filesystem_free_bytes{mountpoint="/"}[24h], 24*3600) < 0
|
||||||
for: 1h
|
for: 1h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
|
summary: "Filesystem predicted to fill within 24h on {{ $labels.instance }}"
|
||||||
description: "Based on the last 6h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
|
description: "Based on the last 24h trend, the root filesystem on {{ $labels.instance }} is predicted to run out of space within 24 hours."
|
||||||
- alert: systemd_not_running
|
- alert: systemd_not_running
|
||||||
expr: node_systemd_system_running == 0
|
expr: node_systemd_system_running == 0
|
||||||
for: 10m
|
for: 10m
|
||||||
@@ -118,13 +118,13 @@ groups:
|
|||||||
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "NSD has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
# Only alert on primary DNS (secondary has cold cache after failover)
|
# Only alert on primary DNS (secondary has cold cache after failover)
|
||||||
- alert: unbound_low_cache_hit_ratio
|
- alert: unbound_low_cache_hit_ratio
|
||||||
expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.5
|
expr: (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) / (rate(unbound_cache_hits_total{dns_role="primary"}[5m]) + rate(unbound_cache_misses_total{dns_role="primary"}[5m]))) < 0.2
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Low DNS cache hit ratio on {{ $labels.instance }}"
|
summary: "Low DNS cache hit ratio on {{ $labels.instance }}"
|
||||||
description: "Unbound cache hit ratio is below 50% on {{ $labels.instance }}."
|
description: "Unbound cache hit ratio is below 20% on {{ $labels.instance }}."
|
||||||
- name: http_proxy_rules
|
- name: http_proxy_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: caddy_down
|
- alert: caddy_down
|
||||||
@@ -171,37 +171,14 @@ groups:
|
|||||||
description: "NATS has {{ $value }} slow consumers on {{ $labels.instance }}."
|
description: "NATS has {{ $value }} slow consumers on {{ $labels.instance }}."
|
||||||
- name: nix_cache_rules
|
- name: nix_cache_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: build_flakes_service_not_active_recently
|
|
||||||
expr: count_over_time(node_systemd_unit_state{instance="nix-cache01.home.2rjus.net:9100", name="build-flakes.service", state="active"}[1h]) < 1
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "The build-flakes service on {{ $labels.instance }} has not run recently"
|
|
||||||
description: "The build-flakes service on {{ $labels.instance }} has not run recently"
|
|
||||||
- alert: build_flakes_error
|
|
||||||
expr: build_flakes_error == 1
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "The build-flakes job has failed for host {{ $labels.host }}."
|
|
||||||
description: "The build-flakes job has failed for host {{ $labels.host }}."
|
|
||||||
- alert: harmonia_down
|
- alert: harmonia_down
|
||||||
expr: node_systemd_unit_state {instance="nix-cache01.home.2rjus.net:9100", name = "harmonia.service", state = "active"} == 0
|
expr: node_systemd_unit_state{instance="nix-cache02.home.2rjus.net:9100", name="harmonia.service", state="active"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Harmonia not running on {{ $labels.instance }}"
|
summary: "Harmonia not running on {{ $labels.instance }}"
|
||||||
description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "Harmonia has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
- alert: low_disk_space_nix
|
|
||||||
expr: node_filesystem_free_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} / node_filesystem_size_bytes{instance="nix-cache01.home.2rjus.net:9100", mountpoint="/nix"} * 100 < 10
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Disk space low on /nix for {{ $labels.instance }}"
|
|
||||||
description: "Disk space is low on /nix for host {{ $labels.instance }}. Please check."
|
|
||||||
- name: home_assistant_rules
|
- name: home_assistant_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: home_assistant_down
|
- alert: home_assistant_down
|
||||||
@@ -229,13 +206,13 @@ groups:
|
|||||||
summary: "Mosquitto not running on {{ $labels.instance }}"
|
summary: "Mosquitto not running on {{ $labels.instance }}"
|
||||||
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
|
description: "Mosquitto has been down on {{ $labels.instance }} more than 5 minutes."
|
||||||
- alert: zigbee_sensor_stale
|
- alert: zigbee_sensor_stale
|
||||||
expr: (time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 7200
|
expr: (time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 14400
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Zigbee sensor {{ $labels.friendly_name }} is stale"
|
summary: "Zigbee sensor {{ $labels.friendly_name }} is stale"
|
||||||
description: "Zigbee temperature sensor {{ $labels.entity }} has not reported data for over 2 hours. The sensor may have a dead battery or connectivity issues."
|
description: "Zigbee temperature sensor {{ $labels.entity }} has not reported data for over 4 hours. The sensor may have a dead battery or connectivity issues."
|
||||||
- name: smartctl_rules
|
- name: smartctl_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: smart_critical_warning
|
- alert: smart_critical_warning
|
||||||
@@ -282,32 +259,32 @@ groups:
|
|||||||
description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
|
description: "Wireguard handshake timeout on {{ $labels.instance }} for peer {{ $labels.public_key }}."
|
||||||
- name: monitoring_rules
|
- name: monitoring_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: prometheus_not_running
|
- alert: victoriametrics_not_running
|
||||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="prometheus.service", state="active"} == 0
|
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="victoriametrics.service", state="active"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Prometheus service not running on {{ $labels.instance }}"
|
summary: "VictoriaMetrics service not running on {{ $labels.instance }}"
|
||||||
description: "Prometheus service not running on {{ $labels.instance }}"
|
description: "VictoriaMetrics service not running on {{ $labels.instance }}"
|
||||||
|
- alert: vmalert_not_running
|
||||||
|
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="vmalert.service", state="active"} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "vmalert service not running on {{ $labels.instance }}"
|
||||||
|
description: "vmalert service not running on {{ $labels.instance }}"
|
||||||
- alert: alertmanager_not_running
|
- alert: alertmanager_not_running
|
||||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
|
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="alertmanager.service", state="active"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Alertmanager service not running on {{ $labels.instance }}"
|
summary: "Alertmanager service not running on {{ $labels.instance }}"
|
||||||
description: "Alertmanager service not running on {{ $labels.instance }}"
|
description: "Alertmanager service not running on {{ $labels.instance }}"
|
||||||
- alert: pushgateway_not_running
|
|
||||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="pushgateway.service", state="active"} == 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Pushgateway service not running on {{ $labels.instance }}"
|
|
||||||
description: "Pushgateway service not running on {{ $labels.instance }}"
|
|
||||||
- alert: loki_not_running
|
- alert: loki_not_running
|
||||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="loki.service", state="active"} == 0
|
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="loki.service", state="active"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@@ -315,29 +292,13 @@ groups:
|
|||||||
summary: "Loki service not running on {{ $labels.instance }}"
|
summary: "Loki service not running on {{ $labels.instance }}"
|
||||||
description: "Loki service not running on {{ $labels.instance }}"
|
description: "Loki service not running on {{ $labels.instance }}"
|
||||||
- alert: grafana_not_running
|
- alert: grafana_not_running
|
||||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
|
expr: node_systemd_unit_state{instance="monitoring02.home.2rjus.net:9100", name="grafana.service", state="active"} == 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Grafana service not running on {{ $labels.instance }}"
|
summary: "Grafana service not running on {{ $labels.instance }}"
|
||||||
description: "Grafana service not running on {{ $labels.instance }}"
|
description: "Grafana service not running on {{ $labels.instance }}"
|
||||||
- alert: tempo_not_running
|
|
||||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="tempo.service", state="active"} == 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Tempo service not running on {{ $labels.instance }}"
|
|
||||||
description: "Tempo service not running on {{ $labels.instance }}"
|
|
||||||
- alert: pyroscope_not_running
|
|
||||||
expr: node_systemd_unit_state{instance="monitoring01.home.2rjus.net:9100", name="podman-pyroscope.service", state="active"} == 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Pyroscope service not running on {{ $labels.instance }}"
|
|
||||||
description: "Pyroscope service not running on {{ $labels.instance }}"
|
|
||||||
- name: proxmox_rules
|
- name: proxmox_rules
|
||||||
rules:
|
rules:
|
||||||
- alert: pve_node_down
|
- alert: pve_node_down
|
||||||
@@ -392,3 +353,47 @@ groups:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}"
|
summary: "Cannot scrape OpenBao metrics from {{ $labels.instance }}"
|
||||||
description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}."
|
description: "OpenBao metrics endpoint is not responding on {{ $labels.instance }}."
|
||||||
|
- name: certificate_rules
|
||||||
|
rules:
|
||||||
|
- alert: tls_certificate_expiring_soon
|
||||||
|
expr: (probe_ssl_earliest_cert_expiry - time()) < 86400 * 7
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "TLS certificate expiring soon on {{ $labels.instance }}"
|
||||||
|
description: "The TLS certificate for {{ $labels.instance }} expires in less than 7 days."
|
||||||
|
- alert: tls_certificate_expiring_critical
|
||||||
|
expr: (probe_ssl_earliest_cert_expiry - time()) < 86400
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "TLS certificate expiring within 24h on {{ $labels.instance }}"
|
||||||
|
description: "The TLS certificate for {{ $labels.instance }} expires in less than 24 hours. Immediate action required."
|
||||||
|
- alert: tls_probe_failed
|
||||||
|
expr: probe_success{job="blackbox_tls"} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "TLS probe failed for {{ $labels.instance }}"
|
||||||
|
description: "Cannot connect to {{ $labels.instance }} to check TLS certificate. The service may be down or unreachable."
|
||||||
|
- name: homelab_deploy_rules
|
||||||
|
rules:
|
||||||
|
- alert: homelab_deploy_build_failed
|
||||||
|
expr: increase(homelab_deploy_build_host_total{status="failure"}[1h]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Build failed for {{ $labels.host }} in repo {{ $labels.repo }}"
|
||||||
|
description: "Host {{ $labels.host }} failed to build from {{ $labels.repo }} repository."
|
||||||
|
- alert: homelab_deploy_builder_down
|
||||||
|
expr: up{job="homelab-deploy-builder"} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Homelab deploy builder not responding on {{ $labels.instance }}"
|
||||||
|
description: "Cannot scrape homelab-deploy-builder metrics from {{ $labels.instance }} for 5 minutes."
|
||||||
|
|||||||
@@ -1,37 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
services.tempo = {
|
|
||||||
enable = true;
|
|
||||||
settings = {
|
|
||||||
server = {
|
|
||||||
http_listen_port = 3200;
|
|
||||||
grpc_listen_port = 3201;
|
|
||||||
};
|
|
||||||
distributor = {
|
|
||||||
receivers = {
|
|
||||||
otlp = {
|
|
||||||
protocols = {
|
|
||||||
http = {
|
|
||||||
endpoint = ":4318";
|
|
||||||
cors = {
|
|
||||||
allowed_origins = [ "*.home.2rjus.net" ];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
storage = {
|
|
||||||
trace = {
|
|
||||||
backend = "local";
|
|
||||||
local = {
|
|
||||||
path = "/var/lib/tempo";
|
|
||||||
};
|
|
||||||
wal = {
|
|
||||||
path = "/var/lib/tempo/wal";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -35,9 +35,18 @@
|
|||||||
HOMELAB = {
|
HOMELAB = {
|
||||||
jetstream = "enabled";
|
jetstream = "enabled";
|
||||||
users = [
|
users = [
|
||||||
|
# alerttonotify (full access to HOMELAB account)
|
||||||
{
|
{
|
||||||
nkey = "UASLNKLWGICRTZMIXVD3RXLQ57XRIMCKBHP5V3PYFFRNO3E3BIJBCYMZ";
|
nkey = "UASLNKLWGICRTZMIXVD3RXLQ57XRIMCKBHP5V3PYFFRNO3E3BIJBCYMZ";
|
||||||
}
|
}
|
||||||
|
# nixos-exporter (restricted to nixos-exporter subjects)
|
||||||
|
{
|
||||||
|
nkey = "UBCL3ODHVERVZJNGUJ567YBBKHQZOV3LK3WO6TVVSGQOCTK2NQ3IJVRV"; # Replace with public key from: nix develop -c nk -gen user -pubout
|
||||||
|
permissions = {
|
||||||
|
publish = [ "nixos-exporter.>" ];
|
||||||
|
subscribe = [ "nixos-exporter.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -65,10 +74,12 @@
|
|||||||
publish = [
|
publish = [
|
||||||
"deploy.test.>"
|
"deploy.test.>"
|
||||||
"deploy.discover"
|
"deploy.discover"
|
||||||
|
"build.>"
|
||||||
];
|
];
|
||||||
subscribe = [
|
subscribe = [
|
||||||
"deploy.responses.>"
|
"deploy.responses.>"
|
||||||
"deploy.discover"
|
"deploy.discover"
|
||||||
|
"build.responses.>"
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -76,8 +87,30 @@
|
|||||||
{
|
{
|
||||||
nkey = "UD2BFB7DLM67P5UUVCKBUJMCHADIZLGGVUNSRLZE2ZC66FW2XT44P73Y";
|
nkey = "UD2BFB7DLM67P5UUVCKBUJMCHADIZLGGVUNSRLZE2ZC66FW2XT44P73Y";
|
||||||
permissions = {
|
permissions = {
|
||||||
publish = [ "deploy.>" ];
|
publish = [
|
||||||
subscribe = [ "deploy.>" ];
|
"deploy.>"
|
||||||
|
"build.>"
|
||||||
|
];
|
||||||
|
subscribe = [
|
||||||
|
"deploy.>"
|
||||||
|
"build.responses.>"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Builder (subscribes to build requests, publishes responses)
|
||||||
|
{
|
||||||
|
nkey = "UB4PUHGKAWAK6OS62FX7DOQTPFFJTLZZBTKCOCAXDP75H3NSMWAEDJ7E";
|
||||||
|
permissions = {
|
||||||
|
subscribe = [ "build.>" ];
|
||||||
|
publish = [ "build.responses.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Scheduler (publishes build requests, subscribes to responses)
|
||||||
|
{
|
||||||
|
nkey = "UDQ5SFEGDM66AQGLK7KQDW6ZOC2QCXE2P6EJQ6VPBSR2CRCABPOVWRI4";
|
||||||
|
permissions = {
|
||||||
|
publish = [ "build.>" ];
|
||||||
|
subscribe = [ "build.responses.>" ];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -1,29 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
let
|
|
||||||
build-flake-script = pkgs.writeShellApplication {
|
|
||||||
name = "build-flake-script";
|
|
||||||
runtimeInputs = with pkgs; [
|
|
||||||
git
|
|
||||||
nix
|
|
||||||
nixos-rebuild
|
|
||||||
jq
|
|
||||||
curl
|
|
||||||
];
|
|
||||||
text = builtins.readFile ./build-flakes.sh;
|
|
||||||
};
|
|
||||||
in
|
|
||||||
{
|
|
||||||
systemd.services."build-flakes" = {
|
|
||||||
serviceConfig = {
|
|
||||||
Type = "exec";
|
|
||||||
ExecStart = "${build-flake-script}/bin/build-flake-script";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
systemd.timers."build-flakes" = {
|
|
||||||
enable = true;
|
|
||||||
wantedBy = [ "timers.target" ];
|
|
||||||
timerConfig = {
|
|
||||||
OnCalendar = "*-*-* *:30:00";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
JOB_NAME="build_flakes"
|
|
||||||
|
|
||||||
cd /root/nixos-servers
|
|
||||||
git pull
|
|
||||||
echo "Starting nixos-servers builds"
|
|
||||||
for host in $(nix flake show --json| jq -r '.nixosConfigurations | keys[]'); do
|
|
||||||
echo "Building $host"
|
|
||||||
if ! nixos-rebuild --verbose -L --flake ".#$host" build; then
|
|
||||||
echo "Build failed for $host"
|
|
||||||
cat <<EOF | curl -sS -X PUT --data-binary @- "https://pushgw.home.2rjus.net/metrics/job/$JOB_NAME/host/$host"
|
|
||||||
# TYPE build_flakes_error gauge
|
|
||||||
# HELP build_flakes_error 0 if the build was successful, 1 if it failed
|
|
||||||
build_flakes_error{instance="$HOSTNAME"} 1
|
|
||||||
EOF
|
|
||||||
else
|
|
||||||
echo "Build successful for $host"
|
|
||||||
cat <<EOF | curl -sS -X PUT --data-binary @- "https://pushgw.home.2rjus.net/metrics/job/$JOB_NAME/host/$host"
|
|
||||||
# TYPE build_flakes_error gauge
|
|
||||||
# HELP build_flakes_error 0 if the build was successful, 1 if it failed
|
|
||||||
build_flakes_error{instance="$HOSTNAME"} 0
|
|
||||||
EOF
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
echo "All nixos-servers builds complete"
|
|
||||||
|
|
||||||
echo "Building gunter"
|
|
||||||
cd /root/nixos
|
|
||||||
git pull
|
|
||||||
host="gunter"
|
|
||||||
if ! nixos-rebuild --verbose -L --flake ".#gunter" build; then
|
|
||||||
echo "Build failed for $host"
|
|
||||||
cat <<EOF | curl -sS -X PUT --data-binary @- "https://pushgw.home.2rjus.net/metrics/job/$JOB_NAME/host/$host"
|
|
||||||
# TYPE build_flakes_error gauge
|
|
||||||
# HELP build_flakes_error 0 if the build was successful, 1 if it failed
|
|
||||||
build_flakes_error{instance="$HOSTNAME"} 1
|
|
||||||
EOF
|
|
||||||
else
|
|
||||||
echo "Build successful for $host"
|
|
||||||
cat <<EOF | curl -sS -X PUT --data-binary @- "https://pushgw.home.2rjus.net/metrics/job/$JOB_NAME/host/$host"
|
|
||||||
# TYPE build_flakes_error gauge
|
|
||||||
# HELP build_flakes_error 0 if the build was successful, 1 if it failed
|
|
||||||
build_flakes_error{instance="$HOSTNAME"} 0
|
|
||||||
EOF
|
|
||||||
fi
|
|
||||||
@@ -1,10 +1,8 @@
|
|||||||
{ ... }:
|
{ ... }:
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
./build-flakes.nix
|
|
||||||
./harmonia.nix
|
./harmonia.nix
|
||||||
./proxy.nix
|
./proxy.nix
|
||||||
./nix.nix
|
./nix.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
{ pkgs, config, ... }:
|
{ pkgs, config, ... }:
|
||||||
{
|
{
|
||||||
vault.secrets.cache-secret = {
|
vault.secrets.cache-secret = {
|
||||||
secretPath = "hosts/nix-cache01/cache-secret";
|
secretPath = "hosts/${config.networking.hostName}/cache-secret";
|
||||||
extractKey = "key";
|
extractKey = "key";
|
||||||
outputDir = "/run/secrets/cache-secret";
|
outputDir = "/run/secrets/cache-secret";
|
||||||
services = [ "harmonia" ];
|
services = [ "harmonia" ];
|
||||||
|
|||||||
270
services/victoriametrics/default.nix
Normal file
270
services/victoriametrics/default.nix
Normal file
@@ -0,0 +1,270 @@
|
|||||||
|
{ self, config, lib, pkgs, ... }:
|
||||||
|
let
|
||||||
|
monLib = import ../../lib/monitoring.nix { inherit lib; };
|
||||||
|
externalTargets = import ../monitoring/external-targets.nix;
|
||||||
|
|
||||||
|
nodeExporterTargets = monLib.generateNodeExporterTargets self externalTargets;
|
||||||
|
autoScrapeConfigs = monLib.generateScrapeConfigs self externalTargets;
|
||||||
|
|
||||||
|
# TLS endpoints to monitor for certificate expiration via blackbox exporter
|
||||||
|
tlsTargets = [
|
||||||
|
"https://vault.home.2rjus.net:8200"
|
||||||
|
"https://auth.home.2rjus.net"
|
||||||
|
"https://testvm01.home.2rjus.net"
|
||||||
|
"https://nzbget.home.2rjus.net"
|
||||||
|
"https://radarr.home.2rjus.net"
|
||||||
|
"https://sonarr.home.2rjus.net"
|
||||||
|
"https://ha.home.2rjus.net"
|
||||||
|
"https://z2m.home.2rjus.net"
|
||||||
|
"https://metrics.home.2rjus.net"
|
||||||
|
"https://alertmanager.home.2rjus.net"
|
||||||
|
"https://grafana.home.2rjus.net"
|
||||||
|
"https://jelly.home.2rjus.net"
|
||||||
|
"https://nix-cache.home.2rjus.net"
|
||||||
|
"https://grafana-test.home.2rjus.net"
|
||||||
|
];
|
||||||
|
|
||||||
|
# Script to fetch AppRole token for VictoriaMetrics to use when scraping OpenBao metrics
|
||||||
|
fetchOpenbaoToken = pkgs.writeShellApplication {
|
||||||
|
name = "fetch-openbao-token-vm";
|
||||||
|
runtimeInputs = [ pkgs.curl pkgs.jq ];
|
||||||
|
text = ''
|
||||||
|
VAULT_ADDR="https://vault01.home.2rjus.net:8200"
|
||||||
|
APPROLE_DIR="/var/lib/vault/approle"
|
||||||
|
OUTPUT_FILE="/run/secrets/victoriametrics/openbao-token"
|
||||||
|
|
||||||
|
# Read AppRole credentials
|
||||||
|
if [ ! -f "$APPROLE_DIR/role-id" ] || [ ! -f "$APPROLE_DIR/secret-id" ]; then
|
||||||
|
echo "AppRole credentials not found at $APPROLE_DIR" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
ROLE_ID=$(cat "$APPROLE_DIR/role-id")
|
||||||
|
SECRET_ID=$(cat "$APPROLE_DIR/secret-id")
|
||||||
|
|
||||||
|
# Authenticate to Vault
|
||||||
|
AUTH_RESPONSE=$(curl -sf -k -X POST \
|
||||||
|
-d "{\"role_id\":\"$ROLE_ID\",\"secret_id\":\"$SECRET_ID\"}" \
|
||||||
|
"$VAULT_ADDR/v1/auth/approle/login")
|
||||||
|
|
||||||
|
# Extract token
|
||||||
|
VAULT_TOKEN=$(echo "$AUTH_RESPONSE" | jq -r '.auth.client_token')
|
||||||
|
if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
|
||||||
|
echo "Failed to extract Vault token from response" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Write token to file
|
||||||
|
mkdir -p "$(dirname "$OUTPUT_FILE")"
|
||||||
|
echo -n "$VAULT_TOKEN" > "$OUTPUT_FILE"
|
||||||
|
chown victoriametrics:victoriametrics "$OUTPUT_FILE"
|
||||||
|
chmod 0400 "$OUTPUT_FILE"
|
||||||
|
|
||||||
|
echo "Successfully fetched OpenBao token"
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
|
scrapeConfigs = [
|
||||||
|
# Auto-generated node-exporter targets from flake hosts + external
|
||||||
|
{
|
||||||
|
job_name = "node-exporter";
|
||||||
|
static_configs = nodeExporterTargets;
|
||||||
|
}
|
||||||
|
# Systemd exporter on all hosts (same targets, different port)
|
||||||
|
{
|
||||||
|
job_name = "systemd-exporter";
|
||||||
|
static_configs = map
|
||||||
|
(cfg: cfg // {
|
||||||
|
targets = map (t: builtins.replaceStrings [ ":9100" ] [ ":9558" ] t) cfg.targets;
|
||||||
|
})
|
||||||
|
nodeExporterTargets;
|
||||||
|
}
|
||||||
|
# Local monitoring services
|
||||||
|
{
|
||||||
|
job_name = "victoriametrics";
|
||||||
|
static_configs = [{ targets = [ "localhost:8428" ]; }];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "loki";
|
||||||
|
static_configs = [{ targets = [ "localhost:3100" ]; }];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "grafana";
|
||||||
|
static_configs = [{ targets = [ "localhost:3000" ]; }];
|
||||||
|
}
|
||||||
|
{
|
||||||
|
job_name = "alertmanager";
|
||||||
|
static_configs = [{ targets = [ "localhost:9093" ]; }];
|
||||||
|
}
|
||||||
|
# Caddy metrics from nix-cache02
|
||||||
|
{
|
||||||
|
job_name = "nix-cache_caddy";
|
||||||
|
scheme = "https";
|
||||||
|
static_configs = [{ targets = [ "nix-cache.home.2rjus.net" ]; }];
|
||||||
|
}
|
||||||
|
# OpenBao metrics with bearer token auth
|
||||||
|
{
|
||||||
|
job_name = "openbao";
|
||||||
|
scheme = "https";
|
||||||
|
metrics_path = "/v1/sys/metrics";
|
||||||
|
params = { format = [ "prometheus" ]; };
|
||||||
|
static_configs = [{ targets = [ "vault01.home.2rjus.net:8200" ]; }];
|
||||||
|
authorization = {
|
||||||
|
type = "Bearer";
|
||||||
|
credentials_file = "/run/secrets/victoriametrics/openbao-token";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Apiary external service
|
||||||
|
{
|
||||||
|
job_name = "apiary";
|
||||||
|
scheme = "https";
|
||||||
|
scrape_interval = "60s";
|
||||||
|
static_configs = [{ targets = [ "apiary.t-juice.club" ]; }];
|
||||||
|
authorization = {
|
||||||
|
type = "Bearer";
|
||||||
|
credentials_file = "/run/secrets/victoriametrics-apiary-token";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Blackbox TLS certificate monitoring
|
||||||
|
{
|
||||||
|
job_name = "blackbox_tls";
|
||||||
|
metrics_path = "/probe";
|
||||||
|
params = {
|
||||||
|
module = [ "https_cert" ];
|
||||||
|
};
|
||||||
|
static_configs = [{ targets = tlsTargets; }];
|
||||||
|
relabel_configs = [
|
||||||
|
{
|
||||||
|
source_labels = [ "__address__" ];
|
||||||
|
target_label = "__param_target";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
source_labels = [ "__param_target" ];
|
||||||
|
target_label = "instance";
|
||||||
|
}
|
||||||
|
{
|
||||||
|
target_label = "__address__";
|
||||||
|
replacement = "127.0.0.1:9115";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
# Sonarr exporter
|
||||||
|
{
|
||||||
|
job_name = "sonarr";
|
||||||
|
static_configs = [{ targets = [ "localhost:9709" ]; }];
|
||||||
|
}
|
||||||
|
# Proxmox VE exporter
|
||||||
|
{
|
||||||
|
job_name = "pve";
|
||||||
|
static_configs = [{ targets = [ "localhost:9221" ]; }];
|
||||||
|
}
|
||||||
|
] ++ autoScrapeConfigs;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
# Static user for VictoriaMetrics (overrides DynamicUser) so vault.secrets
|
||||||
|
# and credential files can be owned by this user
|
||||||
|
users.users.victoriametrics = {
|
||||||
|
isSystemUser = true;
|
||||||
|
group = "victoriametrics";
|
||||||
|
};
|
||||||
|
users.groups.victoriametrics = { };
|
||||||
|
|
||||||
|
# Override DynamicUser since we need a static user for credential file access
|
||||||
|
systemd.services.victoriametrics.serviceConfig = {
|
||||||
|
DynamicUser = lib.mkForce false;
|
||||||
|
User = "victoriametrics";
|
||||||
|
Group = "victoriametrics";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Systemd service to fetch AppRole token for OpenBao scraping
|
||||||
|
systemd.services.victoriametrics-openbao-token = {
|
||||||
|
description = "Fetch OpenBao token for VictoriaMetrics metrics scraping";
|
||||||
|
after = [ "network-online.target" ];
|
||||||
|
wants = [ "network-online.target" ];
|
||||||
|
before = [ "victoriametrics.service" ];
|
||||||
|
requiredBy = [ "victoriametrics.service" ];
|
||||||
|
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
ExecStart = lib.getExe fetchOpenbaoToken;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Timer to periodically refresh the token (AppRole tokens have 1-hour TTL)
|
||||||
|
systemd.timers.victoriametrics-openbao-token = {
|
||||||
|
description = "Refresh OpenBao token for VictoriaMetrics";
|
||||||
|
wantedBy = [ "timers.target" ];
|
||||||
|
timerConfig = {
|
||||||
|
OnBootSec = "5min";
|
||||||
|
OnUnitActiveSec = "30min";
|
||||||
|
RandomizedDelaySec = "5min";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Fetch apiary bearer token from Vault
|
||||||
|
vault.secrets.victoriametrics-apiary-token = {
|
||||||
|
secretPath = "hosts/monitoring02/apiary-token";
|
||||||
|
extractKey = "password";
|
||||||
|
owner = "victoriametrics";
|
||||||
|
group = "victoriametrics";
|
||||||
|
services = [ "victoriametrics" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
services.victoriametrics = {
|
||||||
|
enable = true;
|
||||||
|
retentionPeriod = "3"; # 3 months
|
||||||
|
# Disable config check since we reference external credential files
|
||||||
|
checkConfig = false;
|
||||||
|
prometheusConfig = {
|
||||||
|
global.scrape_interval = "15s";
|
||||||
|
scrape_configs = scrapeConfigs;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# vmalert for alerting rules
|
||||||
|
services.vmalert.instances.default = {
|
||||||
|
enable = true;
|
||||||
|
settings = {
|
||||||
|
"datasource.url" = "http://localhost:8428";
|
||||||
|
"notifier.url" = [ "http://localhost:9093" ];
|
||||||
|
"rule" = [ ../monitoring/rules.yml ];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Caddy reverse proxy for VictoriaMetrics and vmalert
|
||||||
|
services.caddy.virtualHosts."metrics.home.2rjus.net".extraConfig = ''
|
||||||
|
reverse_proxy http://127.0.0.1:8428
|
||||||
|
'';
|
||||||
|
services.caddy.virtualHosts."vmalert.home.2rjus.net".extraConfig = ''
|
||||||
|
reverse_proxy http://127.0.0.1:8880
|
||||||
|
'';
|
||||||
|
|
||||||
|
# Alertmanager
|
||||||
|
services.caddy.virtualHosts."alertmanager.home.2rjus.net".extraConfig = ''
|
||||||
|
reverse_proxy http://127.0.0.1:9093
|
||||||
|
'';
|
||||||
|
|
||||||
|
services.prometheus.alertmanager = {
|
||||||
|
enable = true;
|
||||||
|
configuration = {
|
||||||
|
global = { };
|
||||||
|
route = {
|
||||||
|
receiver = "webhook_natstonotify";
|
||||||
|
group_wait = "30s";
|
||||||
|
group_interval = "5m";
|
||||||
|
repeat_interval = "1h";
|
||||||
|
group_by = [ "alertname" ];
|
||||||
|
};
|
||||||
|
receivers = [
|
||||||
|
{
|
||||||
|
name = "webhook_natstonotify";
|
||||||
|
webhook_configs = [
|
||||||
|
{
|
||||||
|
url = "http://localhost:5001/alert";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -4,10 +4,12 @@
|
|||||||
./acme.nix
|
./acme.nix
|
||||||
./autoupgrade.nix
|
./autoupgrade.nix
|
||||||
./homelab-deploy.nix
|
./homelab-deploy.nix
|
||||||
|
./kanidm-client.nix
|
||||||
./monitoring
|
./monitoring
|
||||||
./motd.nix
|
./motd.nix
|
||||||
./packages.nix
|
./packages.nix
|
||||||
./nix.nix
|
./nix.nix
|
||||||
|
./pipe-to-loki.nix
|
||||||
./root-user.nix
|
./root-user.nix
|
||||||
./pki/root-ca.nix
|
./pki/root-ca.nix
|
||||||
./sshd.nix
|
./sshd.nix
|
||||||
|
|||||||
42
system/kanidm-client.nix
Normal file
42
system/kanidm-client.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{ lib, config, pkgs, ... }:
|
||||||
|
let
|
||||||
|
cfg = config.homelab.kanidm;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
options.homelab.kanidm = {
|
||||||
|
enable = lib.mkEnableOption "Kanidm PAM/NSS client for central authentication";
|
||||||
|
|
||||||
|
server = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "https://auth.home.2rjus.net";
|
||||||
|
description = "URI of the Kanidm server";
|
||||||
|
};
|
||||||
|
|
||||||
|
allowedLoginGroups = lib.mkOption {
|
||||||
|
type = lib.types.listOf lib.types.str;
|
||||||
|
default = [ "ssh-users" ];
|
||||||
|
description = "Groups allowed to log in via PAM";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
config = lib.mkIf cfg.enable {
|
||||||
|
services.kanidm = {
|
||||||
|
package = pkgs.kanidm_1_8;
|
||||||
|
enablePam = true;
|
||||||
|
|
||||||
|
clientSettings = {
|
||||||
|
uri = cfg.server;
|
||||||
|
};
|
||||||
|
|
||||||
|
unixSettings = {
|
||||||
|
pam_allowed_login_groups = cfg.allowedLoginGroups;
|
||||||
|
# Use short names (torjus) instead of SPN format (torjus@home.2rjus.net)
|
||||||
|
# This prevents "PAM user mismatch" errors with SSH
|
||||||
|
uid_attr_map = "name";
|
||||||
|
gid_attr_map = "name";
|
||||||
|
# Create symlink /home/torjus -> /home/torjus@home.2rjus.net
|
||||||
|
home_alias = "name";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -1,4 +1,12 @@
|
|||||||
{ config, ... }:
|
{ config, lib, ... }:
|
||||||
|
let
|
||||||
|
hostLabels = {
|
||||||
|
hostname = config.networking.hostName;
|
||||||
|
tier = config.homelab.host.tier;
|
||||||
|
} // lib.optionalAttrs (config.homelab.host.role != null) {
|
||||||
|
role = config.homelab.host.role;
|
||||||
|
};
|
||||||
|
in
|
||||||
{
|
{
|
||||||
# Configure journald
|
# Configure journald
|
||||||
services.journald = {
|
services.journald = {
|
||||||
@@ -8,6 +16,16 @@
|
|||||||
SystemKeepFree=1G
|
SystemKeepFree=1G
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Fetch Loki push password from Vault (only on hosts with Vault enabled)
|
||||||
|
vault.secrets.promtail-loki-auth = lib.mkIf config.vault.enable {
|
||||||
|
secretPath = "shared/loki/push-auth";
|
||||||
|
extractKey = "password";
|
||||||
|
owner = "promtail";
|
||||||
|
group = "promtail";
|
||||||
|
services = [ "promtail" ];
|
||||||
|
};
|
||||||
|
|
||||||
# Configure promtail
|
# Configure promtail
|
||||||
services.promtail = {
|
services.promtail = {
|
||||||
enable = true;
|
enable = true;
|
||||||
@@ -21,7 +39,11 @@
|
|||||||
|
|
||||||
clients = [
|
clients = [
|
||||||
{
|
{
|
||||||
url = "http://monitoring01.home.2rjus.net:3100/loki/api/v1/push";
|
url = "https://loki.home.2rjus.net/loki/api/v1/push";
|
||||||
|
basic_auth = {
|
||||||
|
username = "promtail";
|
||||||
|
password_file = "/run/secrets/promtail-loki-auth";
|
||||||
|
};
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
@@ -32,17 +54,26 @@
|
|||||||
json = true;
|
json = true;
|
||||||
labels = {
|
labels = {
|
||||||
job = "systemd-journal";
|
job = "systemd-journal";
|
||||||
};
|
} // hostLabels;
|
||||||
};
|
};
|
||||||
relabel_configs = [
|
relabel_configs = [
|
||||||
{
|
{
|
||||||
source_labels = [ "__journal__systemd_unit" ];
|
source_labels = [ "__journal__systemd_unit" ];
|
||||||
target_label = "systemd_unit";
|
target_label = "systemd_unit";
|
||||||
}
|
}
|
||||||
|
];
|
||||||
|
pipeline_stages = [
|
||||||
|
# Extract PRIORITY from journal JSON
|
||||||
|
{ json.expressions.priority = "PRIORITY"; }
|
||||||
|
# Map numeric PRIORITY to level name
|
||||||
{
|
{
|
||||||
source_labels = [ "__journal__hostname" ];
|
template = {
|
||||||
target_label = "host";
|
source = "priority";
|
||||||
|
template = ''{{ if or (eq .Value "0") (eq .Value "1") (eq .Value "2") }}critical{{ else if eq .Value "3" }}error{{ else if eq .Value "4" }}warning{{ else if eq .Value "5" }}notice{{ else if eq .Value "6" }}info{{ else if eq .Value "7" }}debug{{ end }}'';
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
# Attach as level label
|
||||||
|
{ labels.level = "priority"; }
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@@ -53,8 +84,7 @@
|
|||||||
labels = {
|
labels = {
|
||||||
job = "varlog";
|
job = "varlog";
|
||||||
__path__ = "/var/log/**/*.log";
|
__path__ = "/var/log/**/*.log";
|
||||||
hostname = "${config.networking.hostName}";
|
} // hostLabels;
|
||||||
};
|
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,15 +19,34 @@
|
|||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Fetch NKey from Vault for NATS authentication
|
||||||
|
vault.secrets.nixos-exporter-nkey = {
|
||||||
|
secretPath = "shared/nixos-exporter/nkey";
|
||||||
|
extractKey = "nkey";
|
||||||
|
owner = "nixos-exporter";
|
||||||
|
group = "nixos-exporter";
|
||||||
|
};
|
||||||
|
|
||||||
services.prometheus.exporters.nixos = {
|
services.prometheus.exporters.nixos = {
|
||||||
enable = true;
|
enable = true;
|
||||||
# Default port: 9971
|
# Default port: 9971
|
||||||
flake = {
|
flake = {
|
||||||
enable = true;
|
enable = true;
|
||||||
url = "git+https://git.t-juice.club/torjus/nixos-servers.git";
|
url = "git+https://git.t-juice.club/torjus/nixos-servers.git";
|
||||||
|
nats = {
|
||||||
|
enable = true;
|
||||||
|
url = "nats://nats1.home.2rjus.net:4222";
|
||||||
|
nkeySeedFile = "/run/secrets/nixos-exporter-nkey";
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Ensure exporter starts after Vault secret is available
|
||||||
|
systemd.services.prometheus-nixos-exporter = {
|
||||||
|
after = [ "vault-secret-nixos-exporter-nkey.service" ];
|
||||||
|
requires = [ "vault-secret-nixos-exporter-nkey.service" ];
|
||||||
|
};
|
||||||
|
|
||||||
# Register nixos-exporter as a Prometheus scrape target
|
# Register nixos-exporter as a Prometheus scrape target
|
||||||
homelab.monitoring.scrapeTargets = [
|
homelab.monitoring.scrapeTargets = [
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ in
|
|||||||
"https://cuda-maintainers.cachix.org"
|
"https://cuda-maintainers.cachix.org"
|
||||||
];
|
];
|
||||||
trusted-public-keys = [
|
trusted-public-keys = [
|
||||||
"nix-cache.home.2rjus.net-1:2kowZOG6pvhoK4AHVO3alBlvcghH20wchzoR0V86UWI="
|
"nix-cache02.home.2rjus.net-1:QyT5FAvJtV+EPQrgQQ6iV9JMg1kRiWuIAJftM35QMls="
|
||||||
"cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="
|
"cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="
|
||||||
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||||
];
|
];
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user