Compare commits
178 Commits
4e8cc124f2
...
pipe-to-lo
| Author | SHA1 | Date |
|---|---|---|
|
78eb04205f
|
|||
| 19cb61ebbc | |||
|
9ed09c9a9c
|
|||
|
b31c64f1b9
|
|||
|
54b6e37420
|
|||
|
b845a8bb8b
|
|||
|
bfbf0cea68
|
|||
|
3abe5e83a7
|
|||
|
67c27555f3
|
|||
|
1674b6a844
|
|||
|
311be282b6
|
|||
|
11cbb64097
|
|||
|
e2dd21c994
|
|||
|
463342133e
|
|||
|
de36b9d016
|
|||
|
3f1d966919
|
|||
|
7fcc043a4d
|
|||
|
70ec5f8109
|
|||
|
c2ec34cab9
|
|||
|
8fbf1224fa
|
|||
|
8959829f77
|
|||
|
93dbb45802
|
|||
|
538c2ad097
|
|||
|
d99c82c74c
|
|||
|
ca0e3fd629
|
|||
|
732e9b8c22
|
|||
|
3a14ffd6b5
|
|||
|
f9a3961457
|
|||
|
003d4ccf03
|
|||
|
735b8a9ee3
|
|||
|
94feae82a0
|
|||
|
3f94f7ee95
|
|||
|
b7e398c9a7
|
|||
|
8ec2a083bd
|
|||
|
ec4ac1477e
|
|||
|
e937c68965
|
|||
|
98e808cd6c
|
|||
|
ba9f47f914
|
|||
|
1066e81ba8
|
|||
|
f0950b33de
|
|||
|
bf199bd7c6
|
|||
| 4e8ecb8a99 | |||
|
38c104ea8c
|
|||
|
536daee4c7
|
|||
| 4c1debf0a3 | |||
|
f36457ee0d
|
|||
|
aedccbd9a0
|
|||
|
bdc6057689
|
|||
| 3a25e3f7bc | |||
|
46f03871f1
|
|||
|
9d019f2b9a
|
|||
|
21db7e9573
|
|||
|
979040aaf7
|
|||
|
8791c29402
|
|||
|
c7a067d7b3
|
|||
|
c518093578
|
|||
| 0b462f0a96 | |||
|
116abf3bec
|
|||
|
b794aa89db
|
|||
|
50a85daa44
|
|||
|
23e561cf49
|
|||
|
7d291f85bf
|
|||
|
2a842c655a
|
|||
|
1f4a5571dc
|
|||
| 13d6d0ea3a | |||
|
eea000b337
|
|||
|
f19ba2f4b6
|
|||
|
a90d9c33d5
|
|||
|
09c9df1bbe
|
|||
|
ae3039af19
|
|||
|
11261c4636
|
|||
|
4ca3c8890f
|
|||
|
78e8d7a600
|
|||
|
0cf72ec191
|
|||
|
6a3a51407e
|
|||
|
a1ae766eb8
|
|||
|
11999b37f3
|
|||
|
29b2b7db52
|
|||
|
b046a1b862
|
|||
|
38348c5980
|
|||
|
370cf2b03a
|
|||
|
7bc465b414
|
|||
|
8d7bc50108
|
|||
|
03e70ac094
|
|||
|
3b32c9479f
|
|||
|
b0d35f9a99
|
|||
|
26ca6817f0
|
|||
|
b03a9b3b64
|
|||
|
f805b9f629
|
|||
|
f3adf7e77f
|
|||
|
f6eca9decc
|
|||
| 6e93b8eae3 | |||
|
c214f8543c
|
|||
|
7933127d77
|
|||
|
13c3897e86
|
|||
|
0643f23281
|
|||
|
ad8570f8db
|
|||
| 2f195d26d3 | |||
|
a926d34287
|
|||
|
be2421746e
|
|||
|
12bf0683f5
|
|||
|
e8a43c6715
|
|||
|
eef52bb8c5
|
|||
|
c6cdbc6799
|
|||
|
4d724329a6
|
|||
|
881e70df27
|
|||
|
b9a269d280
|
|||
|
fcf1a66103
|
|||
|
2034004280
|
|||
| af43f88394 | |||
|
a834497fe8
|
|||
| d3de2a1511 | |||
|
97ff774d3f
|
|||
|
f2c30cc24f
|
|||
|
7e80d2e0bc
|
|||
|
1f5b7b13e2
|
|||
|
c53e36c3f3
|
|||
|
04a252b857
|
|||
|
5d26f52e0d
|
|||
|
506a692548
|
|||
|
fa8f4f0784
|
|||
|
025570dea1
|
|||
|
15c00393f1
|
|||
|
787c14c7a6
|
|||
|
eee3dde04f
|
|||
| 682b07b977 | |||
| 70661ac3d9 | |||
|
506e93a5e2
|
|||
|
b6c41aa910
|
|||
| aa6e00a327 | |||
|
258e350b89
|
|||
|
eba195c192
|
|||
|
bbb22e588e
|
|||
|
879e7aba60
|
|||
|
39a4ea98ab
|
|||
| 1d90dc2181 | |||
|
e9857afc11
|
|||
| 88e9036cb4 | |||
|
59e1962d75
|
|||
|
3dc4422ba0
|
|||
|
f0963624bc
|
|||
| 7b46f94e48 | |||
|
32968147b5
|
|||
|
c515a6b4e1
|
|||
|
4d8b94ce83
|
|||
|
8b0a4ea33a
|
|||
| 5be1f43c24 | |||
|
b322b1156b
|
|||
|
3cccfc0487
|
|||
|
41d4226812
|
|||
|
351fb6f720
|
|||
|
7d92c55d37
|
|||
|
6d117d68ca
|
|||
| a46fbdaa70 | |||
|
2c9d86eaf2
|
|||
|
ccb1c3fe2e
|
|||
|
0700033c0a
|
|||
|
4d33018285
|
|||
|
678fd3d6de
|
|||
|
9d74aa5c04
|
|||
|
fe80ec3576
|
|||
|
870fb3e532
|
|||
|
e602e8d70b
|
|||
|
28b8d7c115
|
|||
|
64f2688349
|
|||
|
09d9d71e2b
|
|||
|
cc799f5929
|
|||
|
0abdda8e8a
|
|||
| 4076361bf7 | |||
|
0ef63ad874
|
|||
| 8f29141dd1 | |||
|
3a9a47f1ad
|
|||
|
fa6380e767
|
|||
|
86a077e152
|
|||
| 9da57c6a2f | |||
| da9dd02d10 | |||
|
e7980978c7
|
|||
|
dd1b64de27
|
180
.claude/agents/auditor.md
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
---
|
||||||
|
name: auditor
|
||||||
|
description: Analyzes audit logs to investigate user activity, command execution, and suspicious behavior on hosts. Can be used standalone for security reviews or called by other agents for behavioral context.
|
||||||
|
tools: Read, Grep, Glob
|
||||||
|
mcpServers:
|
||||||
|
- lab-monitoring
|
||||||
|
---
|
||||||
|
|
||||||
|
You are a security auditor for a NixOS homelab infrastructure. Your task is to analyze audit logs and reconstruct user activity on hosts.
|
||||||
|
|
||||||
|
## Input
|
||||||
|
|
||||||
|
You may receive:
|
||||||
|
- A host or list of hosts to investigate
|
||||||
|
- A time window (e.g., "last hour", "today", "between 14:00 and 15:00")
|
||||||
|
- Optional context: specific events to look for, user to focus on, or suspicious activity to investigate
|
||||||
|
- Optional context from a parent investigation (e.g., "a service stopped at 14:32, what happened around that time?")
|
||||||
|
|
||||||
|
## Audit Log Structure
|
||||||
|
|
||||||
|
Logs are shipped to Loki via promtail. Audit events use these labels:
|
||||||
|
- `host` - hostname
|
||||||
|
- `systemd_unit` - typically `auditd.service` for audit logs
|
||||||
|
- `job` - typically `systemd-journal`
|
||||||
|
|
||||||
|
Audit log entries contain structured data:
|
||||||
|
- `EXECVE` - command execution with full arguments
|
||||||
|
- `USER_LOGIN` / `USER_LOGOUT` - session start/end
|
||||||
|
- `USER_CMD` - sudo command execution
|
||||||
|
- `CRED_ACQ` / `CRED_DISP` - credential acquisition/disposal
|
||||||
|
- `SERVICE_START` / `SERVICE_STOP` - systemd service events
|
||||||
|
|
||||||
|
## Investigation Techniques
|
||||||
|
|
||||||
|
### 1. SSH Session Activity
|
||||||
|
|
||||||
|
Find SSH logins and session activity:
|
||||||
|
```logql
|
||||||
|
{host="<hostname>", systemd_unit="sshd.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Look for:
|
||||||
|
- Accepted/Failed authentication
|
||||||
|
- Session opened/closed
|
||||||
|
- Unusual source IPs or users
|
||||||
|
|
||||||
|
### 2. Command Execution
|
||||||
|
|
||||||
|
Query executed commands (filter out noise):
|
||||||
|
```logql
|
||||||
|
{host="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
|
||||||
|
```
|
||||||
|
|
||||||
|
Further filtering:
|
||||||
|
- Exclude systemd noise: `!= "systemd" != "/nix/store"`
|
||||||
|
- Focus on specific commands: `|= "rm" |= "-rf"`
|
||||||
|
- Focus on specific user: `|= "uid=1000"`
|
||||||
|
|
||||||
|
### 3. Sudo Activity
|
||||||
|
|
||||||
|
Check for privilege escalation:
|
||||||
|
```logql
|
||||||
|
{host="<hostname>"} |= "sudo" |= "COMMAND"
|
||||||
|
```
|
||||||
|
|
||||||
|
Or via audit:
|
||||||
|
```logql
|
||||||
|
{host="<hostname>"} |= "USER_CMD"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Service Manipulation
|
||||||
|
|
||||||
|
Check if services were manually stopped/started:
|
||||||
|
```logql
|
||||||
|
{host="<hostname>"} |= "EXECVE" |= "systemctl"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. File Operations
|
||||||
|
|
||||||
|
Look for file modifications (if auditd rules are configured):
|
||||||
|
```logql
|
||||||
|
{host="<hostname>"} |= "EXECVE" |= "vim"
|
||||||
|
{host="<hostname>"} |= "EXECVE" |= "nano"
|
||||||
|
{host="<hostname>"} |= "EXECVE" |= "rm"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Query Guidelines
|
||||||
|
|
||||||
|
**Start narrow, expand if needed:**
|
||||||
|
- Begin with `limit: 20-30`
|
||||||
|
- Use tight time windows: `start: "15m"` or `start: "30m"`
|
||||||
|
- Add filters progressively
|
||||||
|
|
||||||
|
**Avoid:**
|
||||||
|
- Querying all audit logs without EXECVE filter (extremely verbose)
|
||||||
|
- Large time ranges without specific filters
|
||||||
|
- Limits over 50 without tight filters
|
||||||
|
|
||||||
|
**Time-bounded queries:**
|
||||||
|
When investigating around a specific event:
|
||||||
|
```logql
|
||||||
|
{host="<hostname>"} |= "EXECVE" != "systemd"
|
||||||
|
```
|
||||||
|
With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"`
|
||||||
|
|
||||||
|
## Suspicious Patterns to Watch For
|
||||||
|
|
||||||
|
1. **Unusual login times** - Activity outside normal hours
|
||||||
|
2. **Failed authentication** - Brute force attempts
|
||||||
|
3. **Privilege escalation** - Unexpected sudo usage
|
||||||
|
4. **Reconnaissance commands** - `whoami`, `id`, `uname`, `cat /etc/passwd`
|
||||||
|
5. **Data exfiltration indicators** - `curl`, `wget`, `scp`, `rsync` to external destinations
|
||||||
|
6. **Persistence mechanisms** - Cron modifications, systemd service creation
|
||||||
|
7. **Log tampering** - Commands targeting log files
|
||||||
|
8. **Lateral movement** - SSH to other internal hosts
|
||||||
|
9. **Service manipulation** - Stopping security services, disabling firewalls
|
||||||
|
10. **Cleanup activity** - Deleting bash history, clearing logs
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
### For Standalone Security Reviews
|
||||||
|
|
||||||
|
```
|
||||||
|
## Activity Summary
|
||||||
|
|
||||||
|
**Host:** <hostname>
|
||||||
|
**Time Period:** <start> to <end>
|
||||||
|
**Sessions Found:** <count>
|
||||||
|
|
||||||
|
## User Sessions
|
||||||
|
|
||||||
|
### Session 1: <user> from <source_ip>
|
||||||
|
- **Login:** HH:MM:SSZ
|
||||||
|
- **Logout:** HH:MM:SSZ (or ongoing)
|
||||||
|
- **Commands executed:**
|
||||||
|
- HH:MM:SSZ - <command>
|
||||||
|
- HH:MM:SSZ - <command>
|
||||||
|
|
||||||
|
## Suspicious Activity
|
||||||
|
|
||||||
|
[If any patterns from the watch list were detected]
|
||||||
|
- **Finding:** <description>
|
||||||
|
- **Evidence:** <log entries>
|
||||||
|
- **Risk Level:** Low / Medium / High
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
[Overall assessment: normal activity, concerning patterns, or clear malicious activity]
|
||||||
|
```
|
||||||
|
|
||||||
|
### When Called by Another Agent
|
||||||
|
|
||||||
|
Provide a focused response addressing the specific question:
|
||||||
|
|
||||||
|
```
|
||||||
|
## Audit Findings
|
||||||
|
|
||||||
|
**Query:** <what was asked>
|
||||||
|
**Time Window:** <investigated period>
|
||||||
|
|
||||||
|
## Relevant Activity
|
||||||
|
|
||||||
|
[Chronological list of relevant events]
|
||||||
|
- HH:MM:SSZ - <event>
|
||||||
|
- HH:MM:SSZ - <event>
|
||||||
|
|
||||||
|
## Assessment
|
||||||
|
|
||||||
|
[Direct answer to the question with supporting evidence]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Guidelines
|
||||||
|
|
||||||
|
- Reconstruct timelines chronologically
|
||||||
|
- Correlate events (login → commands → logout)
|
||||||
|
- Note gaps or missing data
|
||||||
|
- Distinguish between automated (systemd, cron) and interactive activity
|
||||||
|
- Consider the host's role and tier when assessing severity
|
||||||
|
- When called by another agent, focus on answering their specific question
|
||||||
|
- Don't speculate without evidence - state only what the logs do and do not show
|
||||||
211
.claude/agents/investigate-alarm.md
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
---
|
||||||
|
name: investigate-alarm
|
||||||
|
description: Investigates a single system alarm by querying Prometheus metrics and Loki logs, analyzing configuration files for affected hosts/services, and providing root cause analysis.
|
||||||
|
tools: Read, Grep, Glob
|
||||||
|
mcpServers:
|
||||||
|
- lab-monitoring
|
||||||
|
- git-explorer
|
||||||
|
---
|
||||||
|
|
||||||
|
You are an alarm investigation specialist for a NixOS homelab infrastructure. Your task is to analyze a single alarm and determine its root cause.
|
||||||
|
|
||||||
|
## Input
|
||||||
|
|
||||||
|
You will receive information about an alarm, which may include:
|
||||||
|
- Alert name and severity
|
||||||
|
- Affected host or service
|
||||||
|
- Alert expression/threshold
|
||||||
|
- Current value or status
|
||||||
|
- When it started firing
|
||||||
|
|
||||||
|
## Investigation Process
|
||||||
|
|
||||||
|
### 1. Understand the Alert Context
|
||||||
|
|
||||||
|
Start by understanding what the alert is measuring:
|
||||||
|
- Use `get_alert` if you have a fingerprint, or `list_alerts` to find matching alerts
|
||||||
|
- Use `get_metric_metadata` to understand the metric being monitored
|
||||||
|
- Use `search_metrics` to find related metrics
|
||||||
|
|
||||||
|
### 2. Query Current State
|
||||||
|
|
||||||
|
Gather evidence about the current system state:
|
||||||
|
- Use `query` to check the current metric values and related metrics
|
||||||
|
- Use `list_targets` to verify the host/service is being scraped successfully
|
||||||
|
- Look for correlated metrics that might explain the issue
|
||||||
|
|
||||||
|
### 3. Check Service Logs
|
||||||
|
|
||||||
|
Search for relevant log entries using `query_logs`. Focus on service-specific logs and errors.
|
||||||
|
|
||||||
|
**Query strategies (start narrow, expand if needed):**
|
||||||
|
- Start with `limit: 20-30`, increase only if needed
|
||||||
|
- Use tight time windows: `start: "15m"` or `start: "30m"` initially
|
||||||
|
- Filter to specific services: `{host="<hostname>", systemd_unit="<service>.service"}`
|
||||||
|
- Search for errors: `{host="<hostname>"} |= "error"` or `|= "failed"`
|
||||||
|
|
||||||
|
**Common patterns:**
|
||||||
|
- Service logs: `{host="<hostname>", systemd_unit="<service>.service"}`
|
||||||
|
- All errors on host: `{host="<hostname>"} |= "error"`
|
||||||
|
- Journal for a unit: `{host="<hostname>", systemd_unit="nginx.service"} |= "failed"`
|
||||||
|
|
||||||
|
**Avoid:**
|
||||||
|
- Using `start: "1h"` with no filters on busy hosts
|
||||||
|
- Limits over 50 without specific filters
|
||||||
|
|
||||||
|
### 4. Investigate User Activity
|
||||||
|
|
||||||
|
For any analysis of user activity, **always spawn the `auditor` agent**. Do not query audit logs (EXECVE, USER_LOGIN, etc.) directly - delegate this to the auditor.
|
||||||
|
|
||||||
|
**Always call the auditor when:**
|
||||||
|
- A service stopped unexpectedly (may have been manually stopped)
|
||||||
|
- A process was killed or a config was changed
|
||||||
|
- You need to know who was logged in around the time of an incident
|
||||||
|
- You need to understand what commands led to the current state
|
||||||
|
- The cause isn't obvious from service logs alone
|
||||||
|
|
||||||
|
**Do NOT try to query audit logs yourself.** The auditor is specialized for:
|
||||||
|
- Parsing EXECVE records and reconstructing command lines
|
||||||
|
- Correlating SSH sessions with commands executed
|
||||||
|
- Identifying suspicious patterns
|
||||||
|
- Filtering out systemd/nix-store noise
|
||||||
|
|
||||||
|
**Example prompt for auditor:**
|
||||||
|
```
|
||||||
|
Investigate user activity on <hostname> between <start_time> and <end_time>.
|
||||||
|
Context: The prometheus-node-exporter service stopped at 14:32.
|
||||||
|
Determine if it was manually stopped and by whom.
|
||||||
|
```
|
||||||
|
|
||||||
|
Incorporate the auditor's findings into your timeline and root cause analysis.
|
||||||
|
|
||||||
|
### 5. Check Configuration (if relevant)
|
||||||
|
|
||||||
|
If the alert relates to a NixOS-managed service:
|
||||||
|
- Check host configuration in `/hosts/<hostname>/`
|
||||||
|
- Check service modules in `/services/<service>/`
|
||||||
|
- Look for thresholds, resource limits, or misconfigurations
|
||||||
|
- Check `homelab.host` options for tier/priority/role metadata
|
||||||
|
|
||||||
|
### 6. Check for Configuration Drift
|
||||||
|
|
||||||
|
Use the git-explorer MCP server to compare the host's deployed configuration against the current master branch. This helps identify:
|
||||||
|
- Hosts running outdated configurations
|
||||||
|
- Recent changes that might have caused the issue
|
||||||
|
- Whether a fix has already been committed but not deployed
|
||||||
|
|
||||||
|
**Step 1: Get the deployed revision from Prometheus**
|
||||||
|
```promql
|
||||||
|
nixos_flake_info{hostname="<hostname>"}
|
||||||
|
```
|
||||||
|
The `current_rev` label contains the deployed git commit hash.
|
||||||
|
|
||||||
|
**Step 2: Check if the host is behind master**
|
||||||
|
```
|
||||||
|
resolve_ref("master") # Get current master commit
|
||||||
|
is_ancestor(deployed, master) # Check if host is behind
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: See what commits are missing**
|
||||||
|
```
|
||||||
|
commits_between(deployed, master) # List commits not yet deployed
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Check which files changed**
|
||||||
|
```
|
||||||
|
get_diff_files(deployed, master) # Files modified since deployment
|
||||||
|
```
|
||||||
|
Look for files in `hosts/<hostname>/`, `services/<relevant-service>/`, or `system/` that affect this host.
|
||||||
|
|
||||||
|
**Step 5: View configuration at the deployed revision**
|
||||||
|
```
|
||||||
|
get_file_at_commit(deployed, "services/<service>/default.nix")
|
||||||
|
```
|
||||||
|
Compare against the current file to understand differences.
|
||||||
|
|
||||||
|
**Step 6: Find when something changed**
|
||||||
|
```
|
||||||
|
search_commits("<service-name>") # Find commits mentioning the service
|
||||||
|
get_commit_info(<hash>) # Get full details of a specific change
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example workflow for a service-related alert:**
|
||||||
|
1. Query `nixos_flake_info{hostname="monitoring01"}` → `current_rev: 8959829`
|
||||||
|
2. `resolve_ref("master")` → `4633421`
|
||||||
|
3. `is_ancestor("8959829", "4633421")` → Yes, host is behind
|
||||||
|
4. `commits_between("8959829", "4633421")` → 7 commits missing
|
||||||
|
5. `get_diff_files("8959829", "4633421")` → Check if relevant service files changed
|
||||||
|
6. If a fix was committed after the deployed rev, recommend deployment
|
||||||
|
|
||||||
|
### 7. Consider Common Causes
|
||||||
|
|
||||||
|
For infrastructure alerts, common causes include:
|
||||||
|
- **Manual intervention**: Service manually stopped/restarted (call auditor to confirm)
|
||||||
|
- **Configuration drift**: Host running outdated config, fix already in master
|
||||||
|
- **Disk space**: Nix store growth, logs, temp files
|
||||||
|
- **Memory pressure**: Service memory leaks, insufficient limits
|
||||||
|
- **CPU**: Runaway processes, build jobs
|
||||||
|
- **Network**: DNS issues, connectivity problems
|
||||||
|
- **Service restarts**: Failed upgrades, configuration errors
|
||||||
|
- **Scrape failures**: Service down, firewall issues, port changes
|
||||||
|
|
||||||
|
**Note:** If a service stopped unexpectedly and service logs don't show a crash or error, it was likely manual intervention - call the auditor to investigate.
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
Provide a concise report with one of two outcomes:
|
||||||
|
|
||||||
|
### If Root Cause Identified:
|
||||||
|
|
||||||
|
```
|
||||||
|
## Root Cause
|
||||||
|
[1-2 sentence summary of the root cause]
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
[Chronological sequence of relevant events leading to the alert]
|
||||||
|
- HH:MM:SSZ - [Event description]
|
||||||
|
- HH:MM:SSZ - [Event description]
|
||||||
|
- HH:MM:SSZ - [Alert fired]
|
||||||
|
|
||||||
|
### Timeline sources
|
||||||
|
- HH:MM:SSZ - [Source for information about this event. Which metric or log file]
|
||||||
|
- HH:MM:SSZ - [Source for information about this event. Which metric or log file]
|
||||||
|
- HH:MM:SSZ - [Alert fired]
|
||||||
|
|
||||||
|
|
||||||
|
## Evidence
|
||||||
|
- [Specific metric values or log entries that support the conclusion]
|
||||||
|
- [Configuration details if relevant]
|
||||||
|
|
||||||
|
|
||||||
|
## Recommended Actions
|
||||||
|
1. [Specific remediation step]
|
||||||
|
2. [Follow-up actions if any]
|
||||||
|
```
|
||||||
|
|
||||||
|
### If Root Cause Unclear:
|
||||||
|
|
||||||
|
```
|
||||||
|
## Investigation Summary
|
||||||
|
[What was checked and what was found]
|
||||||
|
|
||||||
|
## Possible Causes
|
||||||
|
- [Hypothesis 1 with supporting/contradicting evidence]
|
||||||
|
- [Hypothesis 2 with supporting/contradicting evidence]
|
||||||
|
|
||||||
|
## Additional Information Needed
|
||||||
|
- [Specific data, logs, or access that would help]
|
||||||
|
- [Suggested queries or checks for the operator]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Guidelines
|
||||||
|
|
||||||
|
- Be concise and actionable
|
||||||
|
- Reference specific metric names and values as evidence
|
||||||
|
- Include log snippets when they're informative
|
||||||
|
- Don't speculate without evidence
|
||||||
|
- If the alert is a false positive or expected behavior, explain why
|
||||||
|
- Consider the host's tier (test vs prod) when assessing severity
|
||||||
|
- Build a timeline from log timestamps and metrics to show the sequence of events
|
||||||
|
- **Query logs incrementally**: start with narrow filters and small limits, expand only if needed
|
||||||
|
- **Always delegate to the auditor agent** for any user activity analysis - never query EXECVE or audit logs directly
|
||||||
357
.claude/skills/observability/SKILL.md
Normal file
@@ -0,0 +1,357 @@
|
|||||||
|
---
|
||||||
|
name: observability
|
||||||
|
description: Reference guide for exploring Prometheus metrics and Loki logs when troubleshooting homelab issues. Use when investigating system state, deployments, service health, or searching logs.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Observability Troubleshooting Guide
|
||||||
|
|
||||||
|
Quick reference for exploring Prometheus metrics and Loki logs to troubleshoot homelab issues.
|
||||||
|
|
||||||
|
## Available Tools
|
||||||
|
|
||||||
|
Use the `lab-monitoring` MCP server tools:
|
||||||
|
|
||||||
|
**Metrics:**
|
||||||
|
- `search_metrics` - Find metrics by name substring
|
||||||
|
- `get_metric_metadata` - Get type/help for a specific metric
|
||||||
|
- `query` - Execute PromQL queries
|
||||||
|
- `list_targets` - Check scrape target health
|
||||||
|
- `list_alerts` / `get_alert` - View active alerts
|
||||||
|
|
||||||
|
**Logs:**
|
||||||
|
- `query_logs` - Execute LogQL queries against Loki
|
||||||
|
- `list_labels` - List available log labels
|
||||||
|
- `list_label_values` - List values for a specific label
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Logs Reference
|
||||||
|
|
||||||
|
### Label Reference
|
||||||
|
|
||||||
|
Available labels for log queries:
|
||||||
|
- `host` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`)
|
||||||
|
- `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`)
|
||||||
|
- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
|
||||||
|
- `filename` - For `varlog` job, the log file path
|
||||||
|
- `hostname` - Alternative to `host` for some streams
|
||||||
|
|
||||||
|
### Log Format
|
||||||
|
|
||||||
|
Journal logs are JSON-formatted. Key fields:
|
||||||
|
- `MESSAGE` - The actual log message
|
||||||
|
- `PRIORITY` - Syslog priority (6=info, 4=warning, 3=error)
|
||||||
|
- `SYSLOG_IDENTIFIER` - Program name
|
||||||
|
|
||||||
|
### Basic LogQL Queries
|
||||||
|
|
||||||
|
**Logs from a specific service on a host:**
|
||||||
|
```logql
|
||||||
|
{host="ns1", systemd_unit="nsd.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**All logs from a host:**
|
||||||
|
```logql
|
||||||
|
{host="monitoring01"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Logs from a service across all hosts:**
|
||||||
|
```logql
|
||||||
|
{systemd_unit="nixos-upgrade.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Substring matching (case-sensitive):**
|
||||||
|
```logql
|
||||||
|
{host="ha1"} |= "error"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Exclude pattern:**
|
||||||
|
```logql
|
||||||
|
{host="ns1"} != "routine"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Regex matching:**
|
||||||
|
```logql
|
||||||
|
{systemd_unit="prometheus.service"} |~ "scrape.*failed"
|
||||||
|
```
|
||||||
|
|
||||||
|
**File-based logs (caddy access logs, etc):**
|
||||||
|
```logql
|
||||||
|
{job="varlog", hostname="nix-cache01"}
|
||||||
|
{job="varlog", filename="/var/log/caddy/nix-cache.log"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Time Ranges
|
||||||
|
|
||||||
|
Default lookback is 1 hour. Use `start` parameter for older logs:
|
||||||
|
- `start: "1h"` - Last hour (default)
|
||||||
|
- `start: "24h"` - Last 24 hours
|
||||||
|
- `start: "168h"` - Last 7 days
|
||||||
|
|
||||||
|
### Common Services
|
||||||
|
|
||||||
|
Useful systemd units for troubleshooting:
|
||||||
|
- `nixos-upgrade.service` - Daily auto-upgrade logs
|
||||||
|
- `nsd.service` - DNS server (ns1/ns2)
|
||||||
|
- `prometheus.service` - Metrics collection
|
||||||
|
- `loki.service` - Log aggregation
|
||||||
|
- `caddy.service` - Reverse proxy
|
||||||
|
- `home-assistant.service` - Home automation
|
||||||
|
- `step-ca.service` - Internal CA
|
||||||
|
- `openbao.service` - Secrets management
|
||||||
|
- `sshd.service` - SSH daemon
|
||||||
|
- `nix-gc.service` - Nix garbage collection
|
||||||
|
|
||||||
|
### Bootstrap Logs
|
||||||
|
|
||||||
|
VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:
|
||||||
|
|
||||||
|
- `host` - Target hostname
|
||||||
|
- `branch` - Git branch being deployed
|
||||||
|
- `stage` - Bootstrap stage (see table below)
|
||||||
|
|
||||||
|
**Bootstrap stages:**
|
||||||
|
|
||||||
|
| Stage | Message | Meaning |
|
||||||
|
|-------|---------|---------|
|
||||||
|
| `starting` | Bootstrap starting for \<host\> (branch: \<branch\>) | Bootstrap service has started |
|
||||||
|
| `network_ok` | Network connectivity confirmed | Can reach git server |
|
||||||
|
| `vault_ok` | Vault credentials unwrapped and stored | AppRole credentials provisioned |
|
||||||
|
| `vault_skip` | No Vault token provided - skipping credential setup | No wrapped token was provided |
|
||||||
|
| `vault_warn` | Failed to unwrap Vault token - continuing without secrets | Token unwrap failed (expired/used) |
|
||||||
|
| `building` | Starting nixos-rebuild boot | NixOS build starting |
|
||||||
|
| `success` | Build successful - rebooting into new configuration | Build complete, rebooting |
|
||||||
|
| `failed` | nixos-rebuild failed - manual intervention required | Build failed |
|
||||||
|
|
||||||
|
**Bootstrap queries:**
|
||||||
|
|
||||||
|
```logql
|
||||||
|
{job="bootstrap"} # All bootstrap logs
|
||||||
|
{job="bootstrap", host="myhost"} # Specific host
|
||||||
|
{job="bootstrap", stage="failed"} # All failures
|
||||||
|
{job="bootstrap", stage=~"building|success"} # Track build progress
|
||||||
|
```
|
||||||
|
|
||||||
|
### Extracting JSON Fields
|
||||||
|
|
||||||
|
Parse JSON and filter on fields:
|
||||||
|
```logql
|
||||||
|
{systemd_unit="prometheus.service"} | json | PRIORITY="3"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Metrics Reference
|
||||||
|
|
||||||
|
### Deployment & Version Status
|
||||||
|
|
||||||
|
Check which NixOS revision hosts are running:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
nixos_flake_info
|
||||||
|
```
|
||||||
|
|
||||||
|
Labels:
|
||||||
|
- `current_rev` - Git commit of the running NixOS configuration
|
||||||
|
- `remote_rev` - Latest commit on the remote repository
|
||||||
|
- `nixpkgs_rev` - Nixpkgs revision used to build the system
|
||||||
|
- `nixos_version` - Full NixOS version string (e.g., `25.11.20260203.e576e3c`)
|
||||||
|
|
||||||
|
Check if hosts are behind on updates:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
nixos_flake_revision_behind == 1
|
||||||
|
```
|
||||||
|
|
||||||
|
View flake input versions:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
nixos_flake_input_info
|
||||||
|
```
|
||||||
|
|
||||||
|
Labels: `input` (name), `rev` (revision), `type` (git/github)
|
||||||
|
|
||||||
|
Check flake input age:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
nixos_flake_input_age_seconds / 86400
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns age in days for each flake input.
|
||||||
|
|
||||||
|
### System Health
|
||||||
|
|
||||||
|
Basic host availability:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
up{job="node-exporter"}
|
||||||
|
```
|
||||||
|
|
||||||
|
CPU usage by host:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||||
|
```
|
||||||
|
|
||||||
|
Memory usage:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
|
||||||
|
```
|
||||||
|
|
||||||
|
Disk space (root filesystem):
|
||||||
|
|
||||||
|
```promql
|
||||||
|
node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prometheus Jobs
|
||||||
|
|
||||||
|
All available Prometheus job names:
|
||||||
|
|
||||||
|
**System exporters (on all/most hosts):**
|
||||||
|
- `node-exporter` - System metrics (CPU, memory, disk, network)
|
||||||
|
- `nixos-exporter` - NixOS flake revision and generation info
|
||||||
|
- `systemd-exporter` - Systemd unit status metrics
|
||||||
|
- `homelab-deploy` - Deployment listener metrics
|
||||||
|
|
||||||
|
**Service-specific exporters:**
|
||||||
|
- `caddy` - Reverse proxy metrics (http-proxy)
|
||||||
|
- `nix-cache_caddy` - Nix binary cache metrics
|
||||||
|
- `home-assistant` - Home automation metrics (ha1)
|
||||||
|
- `jellyfin` - Media server metrics (jelly01)
|
||||||
|
- `kanidm` - Authentication server metrics (kanidm01)
|
||||||
|
- `nats` - NATS messaging metrics (nats1)
|
||||||
|
- `openbao` - Secrets management metrics (vault01)
|
||||||
|
- `unbound` - DNS resolver metrics (ns1, ns2)
|
||||||
|
- `wireguard` - VPN tunnel metrics (http-proxy)
|
||||||
|
|
||||||
|
**Monitoring stack (localhost on monitoring01):**
|
||||||
|
- `prometheus` - Prometheus self-metrics
|
||||||
|
- `loki` - Loki self-metrics
|
||||||
|
- `grafana` - Grafana self-metrics
|
||||||
|
- `alertmanager` - Alertmanager metrics
|
||||||
|
- `pushgateway` - Push-based metrics gateway
|
||||||
|
|
||||||
|
**External/infrastructure:**
|
||||||
|
- `pve-exporter` - Proxmox hypervisor metrics
|
||||||
|
- `smartctl` - Disk SMART health (gunter)
|
||||||
|
- `restic_rest` - Backup server metrics
|
||||||
|
- `ghettoptt` - PTT service metrics (gunter)
|
||||||
|
|
||||||
|
### Target Labels
|
||||||
|
|
||||||
|
All scrape targets have these labels:
|
||||||
|
|
||||||
|
**Standard labels:**
|
||||||
|
- `instance` - Full target address (`<hostname>.home.2rjus.net:<port>`)
|
||||||
|
- `job` - Job name (e.g., `node-exporter`, `unbound`, `nixos-exporter`)
|
||||||
|
- `hostname` - Short hostname (e.g., `ns1`, `monitoring01`) - use this for host filtering
|
||||||
|
|
||||||
|
**Host metadata labels** (when configured in `homelab.host`):
|
||||||
|
- `role` - Host role (e.g., `dns`, `build-host`, `vault`)
|
||||||
|
- `tier` - Deployment tier (`test` for test VMs, absent for prod)
|
||||||
|
- `dns_role` - DNS-specific role (`primary` or `secondary` for ns1/ns2)
|
||||||
|
|
||||||
|
### Filtering by Host
|
||||||
|
|
||||||
|
Use the `hostname` label for easy host filtering across all jobs:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
{hostname="ns1"} # All metrics from ns1
|
||||||
|
node_load1{hostname="monitoring01"} # Specific metric by hostname
|
||||||
|
up{hostname="ha1"} # Check if ha1 is up
|
||||||
|
```
|
||||||
|
|
||||||
|
This is simpler than wildcarding the `instance` label:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Old way (still works but verbose)
|
||||||
|
up{instance=~"monitoring01.*"}
|
||||||
|
|
||||||
|
# New way (preferred)
|
||||||
|
up{hostname="monitoring01"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Filtering by Role/Tier
|
||||||
|
|
||||||
|
Filter hosts by their role or tier:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
up{role="dns"} # All DNS servers (ns1, ns2)
|
||||||
|
node_cpu_seconds_total{role="build-host"} # Build hosts only (nix-cache01)
|
||||||
|
up{tier="test"} # All test-tier VMs
|
||||||
|
up{dns_role="primary"} # Primary DNS only (ns1)
|
||||||
|
```
|
||||||
|
|
||||||
|
Current host labels:
|
||||||
|
| Host | Labels |
|
||||||
|
|------|--------|
|
||||||
|
| ns1 | `role=dns`, `dns_role=primary` |
|
||||||
|
| ns2 | `role=dns`, `dns_role=secondary` |
|
||||||
|
| nix-cache01 | `role=build-host` |
|
||||||
|
| vault01 | `role=vault` |
|
||||||
|
| kanidm01 | `role=auth`, `tier=test` |
|
||||||
|
| testvm01/02/03 | `tier=test` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting Workflows
|
||||||
|
|
||||||
|
### Check Deployment Status Across Fleet
|
||||||
|
|
||||||
|
1. Query `nixos_flake_info` to see all hosts' current revisions
|
||||||
|
2. Check `nixos_flake_revision_behind` for hosts needing updates
|
||||||
|
3. Look at upgrade logs: `{systemd_unit="nixos-upgrade.service"}` with `start: "24h"`
|
||||||
|
|
||||||
|
### Investigate Service Issues
|
||||||
|
|
||||||
|
1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures
|
||||||
|
2. Use `list_targets` to see target health details
|
||||||
|
3. Query service logs: `{host="<host>", systemd_unit="<service>.service"}`
|
||||||
|
4. Search for errors: `{host="<host>"} |= "error"`
|
||||||
|
5. Check `list_alerts` for related alerts
|
||||||
|
6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers
|
||||||
|
|
||||||
|
### After Deploying Changes
|
||||||
|
|
||||||
|
1. Verify `current_rev` updated in `nixos_flake_info`
|
||||||
|
2. Confirm `nixos_flake_revision_behind == 0`
|
||||||
|
3. Check service logs for startup issues
|
||||||
|
4. Check service metrics are being scraped
|
||||||
|
|
||||||
|
### Monitor VM Bootstrap
|
||||||
|
|
||||||
|
When provisioning new VMs, track bootstrap progress:
|
||||||
|
|
||||||
|
1. Watch bootstrap logs: `{job="bootstrap", host="<hostname>"}`
|
||||||
|
2. Check for failures: `{job="bootstrap", host="<hostname>", stage="failed"}`
|
||||||
|
3. After success, verify host appears in metrics: `up{hostname="<hostname>"}`
|
||||||
|
4. Check logs are flowing: `{host="<hostname>"}`
|
||||||
|
|
||||||
|
See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline.
|
||||||
|
|
||||||
|
### Debug SSH/Access Issues
|
||||||
|
|
||||||
|
```logql
|
||||||
|
{host="<host>", systemd_unit="sshd.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check Recent Upgrades
|
||||||
|
|
||||||
|
```logql
|
||||||
|
{systemd_unit="nixos-upgrade.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
With `start: "24h"` to see last 24 hours of upgrades across all hosts.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Default scrape interval is 15s for most metrics targets
|
||||||
|
- Default log lookback is 1h - use `start` parameter for older logs
|
||||||
|
- Use `rate()` for counter metrics, direct queries for gauges
|
||||||
|
- Use the `hostname` label to filter metrics by host (simpler than regex on `instance`)
|
||||||
|
- Host metadata labels (`role`, `tier`, `dns_role`) are propagated to all scrape targets
|
||||||
|
- Log `MESSAGE` field contains the actual log content in JSON format
|
||||||
89
.claude/skills/quick-plan/SKILL.md
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
---
|
||||||
|
name: quick-plan
|
||||||
|
description: Create a planning document for a future homelab project. Use when the user wants to document ideas for future work without implementing immediately.
|
||||||
|
argument-hint: [topic or feature to plan]
|
||||||
|
---
|
||||||
|
|
||||||
|
# Quick Plan Generator
|
||||||
|
|
||||||
|
Create a planning document for a future homelab infrastructure project. Plans are for documenting ideas and approaches that will be implemented later, not immediately.
|
||||||
|
|
||||||
|
## Input
|
||||||
|
|
||||||
|
The user provides: $ARGUMENTS
|
||||||
|
|
||||||
|
## Process
|
||||||
|
|
||||||
|
1. **Understand the topic**: Research the codebase to understand:
|
||||||
|
- Current state of related systems
|
||||||
|
- Existing patterns and conventions
|
||||||
|
- Relevant NixOS options or packages
|
||||||
|
- Any constraints or dependencies
|
||||||
|
|
||||||
|
2. **Evaluate options**: If there are multiple approaches, research and compare them with pros/cons.
|
||||||
|
|
||||||
|
3. **Draft the plan**: Create a markdown document following the structure below.
|
||||||
|
|
||||||
|
4. **Save the plan**: Write to `docs/plans/<topic-slug>.md` using a kebab-case filename derived from the topic.
|
||||||
|
|
||||||
|
## Plan Structure
|
||||||
|
|
||||||
|
Use these sections as appropriate (not all plans need every section):
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Title
|
||||||
|
|
||||||
|
## Overview/Goal
|
||||||
|
Brief description of what this plan addresses and why.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
What exists today that's relevant to this plan.
|
||||||
|
|
||||||
|
## Options Evaluated (if multiple approaches)
|
||||||
|
For each option:
|
||||||
|
- **Option Name**
|
||||||
|
- **Pros:** bullet points
|
||||||
|
- **Cons:** bullet points
|
||||||
|
- **Verdict:** brief assessment
|
||||||
|
|
||||||
|
Or use a comparison table for structured evaluation.
|
||||||
|
|
||||||
|
## Recommendation/Decision
|
||||||
|
What approach is recommended and why. Include rationale.
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
Numbered phases or steps. Be specific but not overly detailed.
|
||||||
|
Can use sub-sections for major phases.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
Things still to be determined. Use checkbox format:
|
||||||
|
- [ ] Question 1?
|
||||||
|
- [ ] Question 2?
|
||||||
|
|
||||||
|
## Notes (optional)
|
||||||
|
Additional context, caveats, or references.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Style Guidelines
|
||||||
|
|
||||||
|
- **Concise**: Use bullet points, avoid verbose paragraphs
|
||||||
|
- **Technical but accessible**: Include NixOS config snippets when relevant
|
||||||
|
- **Future-oriented**: These are plans, not specifications
|
||||||
|
- **Acknowledge uncertainty**: Use "Open Questions" for unresolved decisions
|
||||||
|
- **Reference existing patterns**: Mention how this fits with existing infrastructure
|
||||||
|
- **Tables for comparisons**: Use markdown tables when comparing options
|
||||||
|
- **Practical focus**: Emphasize what needs to happen, not theory
|
||||||
|
|
||||||
|
## Examples of Good Plans
|
||||||
|
|
||||||
|
Reference these existing plans for style guidance:
|
||||||
|
- `docs/plans/auth-system-replacement.md` - Good option evaluation with table
|
||||||
|
- `docs/plans/truenas-migration.md` - Good decision documentation with rationale
|
||||||
|
- `docs/plans/remote-access.md` - Good multi-option comparison
|
||||||
|
- `docs/plans/prometheus-scrape-target-labels.md` - Good implementation detail level
|
||||||
|
|
||||||
|
## After Creating the Plan
|
||||||
|
|
||||||
|
1. Tell the user the plan was saved to `docs/plans/<filename>.md`
|
||||||
|
2. Summarize the key points
|
||||||
|
3. Ask if they want any adjustments before committing
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,5 +1,6 @@
|
|||||||
.direnv/
|
.direnv/
|
||||||
result
|
result
|
||||||
|
result-*
|
||||||
|
|
||||||
# Terraform/OpenTofu
|
# Terraform/OpenTofu
|
||||||
terraform/.terraform/
|
terraform/.terraform/
|
||||||
|
|||||||
21
.mcp.json
21
.mcp.json
@@ -19,7 +19,26 @@
|
|||||||
"args": ["run", "git+https://git.t-juice.club/torjus/labmcp#lab-monitoring", "--", "serve", "--enable-silences"],
|
"args": ["run", "git+https://git.t-juice.club/torjus/labmcp#lab-monitoring", "--", "serve", "--enable-silences"],
|
||||||
"env": {
|
"env": {
|
||||||
"PROMETHEUS_URL": "https://prometheus.home.2rjus.net",
|
"PROMETHEUS_URL": "https://prometheus.home.2rjus.net",
|
||||||
"ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net"
|
"ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net",
|
||||||
|
"LOKI_URL": "http://monitoring01.home.2rjus.net:3100"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"homelab-deploy": {
|
||||||
|
"command": "nix",
|
||||||
|
"args": [
|
||||||
|
"run",
|
||||||
|
"git+https://git.t-juice.club/torjus/homelab-deploy",
|
||||||
|
"--",
|
||||||
|
"mcp",
|
||||||
|
"--nats-url", "nats://nats1.home.2rjus.net:4222",
|
||||||
|
"--nkey-file", "/home/torjus/.config/homelab-deploy/test-deployer.nkey"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"git-explorer": {
|
||||||
|
"command": "nix",
|
||||||
|
"args": ["run", "git+https://git.t-juice.club/torjus/labmcp#git-explorer", "--", "serve"],
|
||||||
|
"env": {
|
||||||
|
"GIT_REPO_PATH": "/home/torjus/git/nixos-servers"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
72
.sops.yaml
72
.sops.yaml
@@ -1,72 +0,0 @@
|
|||||||
keys:
|
|
||||||
- &admin_torjus age1lznyk4ee7e7x8n92cq2n87kz9920473ks5u9jlhd3dczfzq4wamqept56u
|
|
||||||
- &server_ns1 age1hz2lz4k050ru3shrk5j3zk3f8azxmrp54pktw5a7nzjml4saudesx6jsl0
|
|
||||||
- &server_ns2 age1w2q4gm2lrcgdzscq8du3ssyvk6qtzm4fcszc92z9ftclq23yyydqdga5um
|
|
||||||
- &server_ns3 age1snmhmpavqy7xddmw4nuny0u4xusqmnqxqarjmghkm5zaluff84eq5xatrd
|
|
||||||
- &server_ns4 age12a3nyvjs8jrwmpkf3tgawel3nwcklwsr35ktmytnvhpawqwzrsfqpgcy0q
|
|
||||||
- &server_ha1 age1d2w5zece9647qwyq4vas9qyqegg96xwmg6c86440a6eg4uj6dd2qrq0w3l
|
|
||||||
- &server_nixos-test1 age1gcyfkxh4fq5zdp0dh484aj82ksz66wrly7qhnpv0r0p576sn9ekse8e9ju
|
|
||||||
- &server_inc1 age1g5luz2rtel3surgzuh62rkvtey7lythrvfenyq954vmeyfpxjqkqdj3wt8
|
|
||||||
- &server_http-proxy age1gq8434ku0xekqmvnseeunv83e779cg03c06gwrusnymdsr3rpufqx6vr3m
|
|
||||||
- &server_ca age1288993th0ge00reg4zqueyvmkrsvk829cs068eekjqfdprsrkeqql7mljk
|
|
||||||
- &server_monitoring01 age1vpns76ykll8jgdlu3h05cur4ew2t3k7u03kxdg8y6ypfhsfhq9fqyurjey
|
|
||||||
- &server_jelly01 age1hchvlf3apn8g8jq2743pw53sd6v6ay6xu6lqk0qufrjeccan9vzsc7hdfq
|
|
||||||
- &server_nix-cache01 age1w029fksjv0edrff9p7s03tgk3axecdkppqymfpwfn2nu2gsqqefqc37sxq
|
|
||||||
- &server_pgdb1 age1ha34qeksr4jeaecevqvv2afqem67eja2mvawlmrqsudch0e7fe7qtpsekv
|
|
||||||
- &server_nats1 age1cxt8kwqzx35yuldazcc49q88qvgy9ajkz30xu0h37uw3ts97jagqgmn2ga
|
|
||||||
- &server_auth01 age16prza00sqzuhwwcyakj6z4hvwkruwkqpmmrsn94a5ucgpkelncdq2ldctk
|
|
||||||
creation_rules:
|
|
||||||
- path_regex: secrets/[^/]+\.(yaml|json|env|ini)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ns1
|
|
||||||
- *server_ns2
|
|
||||||
- *server_ns3
|
|
||||||
- *server_ns4
|
|
||||||
- *server_ha1
|
|
||||||
- *server_nixos-test1
|
|
||||||
- *server_inc1
|
|
||||||
- *server_http-proxy
|
|
||||||
- *server_ca
|
|
||||||
- *server_monitoring01
|
|
||||||
- *server_jelly01
|
|
||||||
- *server_nix-cache01
|
|
||||||
- *server_pgdb1
|
|
||||||
- *server_nats1
|
|
||||||
- *server_auth01
|
|
||||||
- path_regex: secrets/ns3/[^/]+\.(yaml|json|env|ini)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ns3
|
|
||||||
- path_regex: secrets/ca/[^/]+\.(yaml|json|env|ini|)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ca
|
|
||||||
- path_regex: secrets/monitoring01/[^/]+\.(yaml|json|env|ini)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_monitoring01
|
|
||||||
- path_regex: secrets/ca/keys/.+
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ca
|
|
||||||
- path_regex: secrets/nix-cache01/.+
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_nix-cache01
|
|
||||||
- path_regex: secrets/http-proxy/.+
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_http-proxy
|
|
||||||
- path_regex: secrets/auth01/[^/]+\.(yaml|json|env|ini|)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_auth01
|
|
||||||
244
CLAUDE.md
244
CLAUDE.md
@@ -35,6 +35,25 @@ nix build .#create-host
|
|||||||
|
|
||||||
Do not automatically deploy changes. Deployments are usually done by updating the master branch, and then triggering the auto update on the specific host.
|
Do not automatically deploy changes. Deployments are usually done by updating the master branch, and then triggering the auto update on the specific host.
|
||||||
|
|
||||||
|
### SSH Commands
|
||||||
|
|
||||||
|
Do not run SSH commands directly. If a command needs to be run on a remote host, provide the command to the user and ask them to run it manually.
|
||||||
|
|
||||||
|
### Testing Feature Branches on Hosts
|
||||||
|
|
||||||
|
All hosts have the `nixos-rebuild-test` helper script for testing feature branches before merging:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On the target host, test a feature branch
|
||||||
|
nixos-rebuild-test boot <branch-name>
|
||||||
|
nixos-rebuild-test switch <branch-name>
|
||||||
|
|
||||||
|
# Additional arguments are passed through to nixos-rebuild
|
||||||
|
nixos-rebuild-test boot my-feature --show-trace
|
||||||
|
```
|
||||||
|
|
||||||
|
When working on a feature branch that requires testing on a live host, suggest using this command instead of the full flake URL syntax.
|
||||||
|
|
||||||
### Flake Management
|
### Flake Management
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -46,18 +65,45 @@ Do not run `nix flake update`. Should only be done manually by user.
|
|||||||
### Development Environment
|
### Development Environment
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Enter development shell (provides ansible, python3)
|
# Enter development shell
|
||||||
nix develop
|
nix develop
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The devshell provides: `ansible`, `tofu` (OpenTofu), `bao` (OpenBao CLI), `create-host`, and `homelab-deploy`.
|
||||||
|
|
||||||
|
**Important:** When suggesting commands that use devshell tools, always use `nix develop -c <command>` syntax rather than assuming the user is already in a devshell. For example:
|
||||||
|
```bash
|
||||||
|
# Good - works regardless of current shell
|
||||||
|
nix develop -c tofu plan
|
||||||
|
|
||||||
|
# Avoid - requires user to be in devshell
|
||||||
|
tofu plan
|
||||||
|
```
|
||||||
|
|
||||||
|
**OpenTofu:** Use the `-chdir` option instead of `cd` when running tofu commands in subdirectories:
|
||||||
|
```bash
|
||||||
|
# Good - uses -chdir option
|
||||||
|
nix develop -c tofu -chdir=terraform plan
|
||||||
|
nix develop -c tofu -chdir=terraform/vault apply
|
||||||
|
|
||||||
|
# Avoid - changing directories
|
||||||
|
cd terraform && tofu plan
|
||||||
|
```
|
||||||
|
|
||||||
### Secrets Management
|
### Secrets Management
|
||||||
|
|
||||||
Secrets are handled by sops. Do not edit any `.sops.yaml` or any file within `secrets/`. Ask the user to modify if necessary.
|
Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the
|
||||||
|
`vault.secrets` option defined in `system/vault-secrets.nix` to fetch secrets at boot.
|
||||||
|
Terraform manages the secrets and AppRole policies in `terraform/vault/`.
|
||||||
|
|
||||||
### Git Workflow
|
### Git Workflow
|
||||||
|
|
||||||
**Important:** Never commit directly to `master` unless the user explicitly asks for it. Always create a feature branch for changes.
|
**Important:** Never commit directly to `master` unless the user explicitly asks for it. Always create a feature branch for changes.
|
||||||
|
|
||||||
|
**Important:** Never amend commits to `master` unless the user explicitly asks for it. Amending rewrites history and causes issues for deployed configurations.
|
||||||
|
|
||||||
|
**Important:** Do not use `gh pr create` to create pull requests. The git server does not support GitHub CLI for PR creation. Instead, push the branch and let the user create the PR manually via the web interface.
|
||||||
|
|
||||||
When starting a new plan or task, the first step should typically be to create and checkout a new branch with an appropriate name (e.g., `git checkout -b dns-automation` or `git checkout -b fix-nginx-config`).
|
When starting a new plan or task, the first step should typically be to create and checkout a new branch with an appropriate name (e.g., `git checkout -b dns-automation` or `git checkout -b fix-nginx-config`).
|
||||||
|
|
||||||
### Plan Management
|
### Plan Management
|
||||||
@@ -110,6 +156,77 @@ Two MCP servers are available for searching NixOS options and packages:
|
|||||||
|
|
||||||
This ensures documentation matches the exact nixpkgs version (currently NixOS 25.11) used by this flake.
|
This ensures documentation matches the exact nixpkgs version (currently NixOS 25.11) used by this flake.
|
||||||
|
|
||||||
|
### Lab Monitoring
|
||||||
|
|
||||||
|
The **lab-monitoring** MCP server provides access to Prometheus metrics and Loki logs. Use the `/observability` skill for detailed reference on:
|
||||||
|
|
||||||
|
- Available Prometheus jobs and exporters
|
||||||
|
- Loki labels and LogQL query syntax
|
||||||
|
- Bootstrap log monitoring for new VMs
|
||||||
|
- Common troubleshooting workflows
|
||||||
|
|
||||||
|
The skill contains up-to-date information about all scrape targets, host labels, and example queries.
|
||||||
|
|
||||||
|
### Deploying to Test Hosts
|
||||||
|
|
||||||
|
The **homelab-deploy** MCP server enables remote deployments to test-tier hosts via NATS messaging.
|
||||||
|
|
||||||
|
**Available Tools:**
|
||||||
|
|
||||||
|
- `deploy` - Deploy NixOS configuration to test-tier hosts
|
||||||
|
- `list_hosts` - List available deployment targets
|
||||||
|
|
||||||
|
**Deploy Parameters:**
|
||||||
|
|
||||||
|
- `hostname` - Target a specific host (e.g., `vaulttest01`)
|
||||||
|
- `role` - Deploy to all hosts with a specific role (e.g., `vault`)
|
||||||
|
- `all` - Deploy to all test-tier hosts
|
||||||
|
- `action` - nixos-rebuild action: `switch` (default), `boot`, `test`, `dry-activate`
|
||||||
|
- `branch` - Git branch or commit to deploy (default: `master`)
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
```
|
||||||
|
# List available hosts
|
||||||
|
list_hosts()
|
||||||
|
|
||||||
|
# Deploy to a specific host
|
||||||
|
deploy(hostname="vaulttest01", action="switch")
|
||||||
|
|
||||||
|
# Dry-run deployment
|
||||||
|
deploy(hostname="vaulttest01", action="dry-activate")
|
||||||
|
|
||||||
|
# Deploy to all hosts with a role
|
||||||
|
deploy(role="vault", action="switch")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** Only test-tier hosts with `homelab.deploy.enable = true` and the listener service running will respond to deployments.
|
||||||
|
|
||||||
|
**Deploying to Prod Hosts:**
|
||||||
|
|
||||||
|
The MCP server only deploys to test-tier hosts. For prod hosts, use the CLI directly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop -c homelab-deploy -- deploy \
|
||||||
|
--nats-url nats://nats1.home.2rjus.net:4222 \
|
||||||
|
--nkey-file ~/.config/homelab-deploy/admin-deployer.nkey \
|
||||||
|
--branch <branch-name> \
|
||||||
|
--action switch \
|
||||||
|
deploy.prod.<hostname>
|
||||||
|
```
|
||||||
|
|
||||||
|
Subject format: `deploy.<tier>.<hostname>` (e.g., `deploy.prod.monitoring01`, `deploy.test.testvm01`)
|
||||||
|
|
||||||
|
**Verifying Deployments:**
|
||||||
|
|
||||||
|
After deploying, use the `nixos_flake_info` metric from nixos-exporter to verify the host is running the expected revision:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
nixos_flake_info{instance=~"vaulttest01.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `current_rev` label contains the git commit hash of the deployed flake configuration.
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
### Directory Structure
|
### Directory Structure
|
||||||
@@ -119,24 +236,26 @@ This ensures documentation matches the exact nixpkgs version (currently NixOS 25
|
|||||||
- `default.nix` - Entry point, imports configuration.nix and services
|
- `default.nix` - Entry point, imports configuration.nix and services
|
||||||
- `configuration.nix` - Host-specific settings (networking, hardware, users)
|
- `configuration.nix` - Host-specific settings (networking, hardware, users)
|
||||||
- `/system/` - Shared system-level configurations applied to ALL hosts
|
- `/system/` - Shared system-level configurations applied to ALL hosts
|
||||||
- Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix
|
- Core modules: nix.nix, sshd.nix, vault-secrets.nix, acme.nix, autoupgrade.nix
|
||||||
|
- Additional modules: motd.nix (dynamic MOTD), packages.nix (base packages), root-user.nix (root config), homelab-deploy.nix (NATS listener)
|
||||||
- Monitoring: node-exporter and promtail on every host
|
- Monitoring: node-exporter and promtail on every host
|
||||||
- `/modules/` - Custom NixOS modules
|
- `/modules/` - Custom NixOS modules
|
||||||
- `homelab/` - Homelab-specific options (DNS automation, etc.)
|
- `homelab/` - Homelab-specific options (see "Homelab Module Options" section below)
|
||||||
- `/lib/` - Nix library functions
|
- `/lib/` - Nix library functions
|
||||||
- `dns-zone.nix` - DNS zone generation functions
|
- `dns-zone.nix` - DNS zone generation functions
|
||||||
|
- `monitoring.nix` - Prometheus scrape target generation functions
|
||||||
- `/services/` - Reusable service modules, selectively imported by hosts
|
- `/services/` - Reusable service modules, selectively imported by hosts
|
||||||
- `home-assistant/` - Home automation stack
|
- `home-assistant/` - Home automation stack
|
||||||
- `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
|
- `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
|
||||||
- `ns/` - DNS services (authoritative, resolver, zone generation)
|
- `ns/` - DNS services (authoritative, resolver, zone generation)
|
||||||
- `http-proxy/`, `ca/`, `postgres/`, `nats/`, `jellyfin/`, etc.
|
- `vault/` - OpenBao (Vault) secrets server
|
||||||
- `/secrets/` - SOPS-encrypted secrets with age encryption
|
- `actions-runner/` - GitHub Actions runner
|
||||||
|
- `http-proxy/`, `postgres/`, `nats/`, `jellyfin/`, etc.
|
||||||
- `/common/` - Shared configurations (e.g., VM guest agent)
|
- `/common/` - Shared configurations (e.g., VM guest agent)
|
||||||
- `/docs/` - Documentation and plans
|
- `/docs/` - Documentation and plans
|
||||||
- `plans/` - Future plans and proposals
|
- `plans/` - Future plans and proposals
|
||||||
- `plans/completed/` - Completed plans (moved here when done)
|
- `plans/completed/` - Completed plans (moved here when done)
|
||||||
- `/playbooks/` - Ansible playbooks for fleet management
|
- `/playbooks/` - Ansible playbooks for fleet management
|
||||||
- `/.sops.yaml` - SOPS configuration with age keys for all servers
|
|
||||||
|
|
||||||
### Configuration Inheritance
|
### Configuration Inheritance
|
||||||
|
|
||||||
@@ -152,39 +271,41 @@ hosts/<hostname>/default.nix
|
|||||||
All hosts automatically get:
|
All hosts automatically get:
|
||||||
- Nix binary cache (nix-cache.home.2rjus.net)
|
- Nix binary cache (nix-cache.home.2rjus.net)
|
||||||
- SSH with root login enabled
|
- SSH with root login enabled
|
||||||
- SOPS secrets management with auto-generated age keys
|
- OpenBao (Vault) secrets management via AppRole
|
||||||
- Internal ACME CA integration (ca.home.2rjus.net)
|
- Internal ACME CA integration (OpenBao PKI at vault.home.2rjus.net)
|
||||||
- Daily auto-upgrades with auto-reboot
|
- Daily auto-upgrades with auto-reboot
|
||||||
- Prometheus node-exporter + Promtail (logs to monitoring01)
|
- Prometheus node-exporter + Promtail (logs to monitoring01)
|
||||||
|
- Monitoring scrape target auto-registration via `homelab.monitoring` options
|
||||||
- Custom root CA trust
|
- Custom root CA trust
|
||||||
- DNS zone auto-registration via `homelab.dns` options
|
- DNS zone auto-registration via `homelab.dns` options
|
||||||
|
|
||||||
### Active Hosts
|
### Active Hosts
|
||||||
|
|
||||||
Production servers managed by `rebuild-all.sh`:
|
Production servers:
|
||||||
- `ns1`, `ns2` - Primary/secondary DNS servers (10.69.13.5/6)
|
- `ns1`, `ns2` - Primary/secondary DNS servers (10.69.13.5/6)
|
||||||
- `ca` - Internal Certificate Authority
|
- `vault01` - OpenBao (Vault) secrets server + PKI CA
|
||||||
- `ha1` - Home Assistant + Zigbee2MQTT + Mosquitto
|
- `ha1` - Home Assistant + Zigbee2MQTT + Mosquitto
|
||||||
- `http-proxy` - Reverse proxy
|
- `http-proxy` - Reverse proxy
|
||||||
- `monitoring01` - Full observability stack (Prometheus, Grafana, Loki, Tempo, Pyroscope)
|
- `monitoring01` - Full observability stack (Prometheus, Grafana, Loki, Tempo, Pyroscope)
|
||||||
- `jelly01` - Jellyfin media server
|
- `jelly01` - Jellyfin media server
|
||||||
- `nix-cache01` - Binary cache server
|
- `nix-cache01` - Binary cache server + GitHub Actions runner
|
||||||
- `pgdb1` - PostgreSQL database
|
- `pgdb1` - PostgreSQL database
|
||||||
- `nats1` - NATS messaging server
|
- `nats1` - NATS messaging server
|
||||||
- `auth01` - Authentication service
|
|
||||||
|
|
||||||
Template/test hosts:
|
Test/staging hosts:
|
||||||
- `template1` - Base template for cloning new hosts
|
- `testvm01`, `testvm02`, `testvm03` - Test-tier VMs for branch testing and deployment validation
|
||||||
- `nixos-test1` - Test environment
|
|
||||||
|
Template hosts:
|
||||||
|
- `template1`, `template2` - Base templates for cloning new hosts
|
||||||
|
|
||||||
### Flake Inputs
|
### Flake Inputs
|
||||||
|
|
||||||
- `nixpkgs` - NixOS 25.11 stable (primary)
|
- `nixpkgs` - NixOS 25.11 stable (primary)
|
||||||
- `nixpkgs-unstable` - Unstable channel (available via overlay as `pkgs.unstable.<package>`)
|
- `nixpkgs-unstable` - Unstable channel (available via overlay as `pkgs.unstable.<package>`)
|
||||||
- `sops-nix` - Secrets management
|
- `nixos-exporter` - NixOS module for exposing flake revision metrics (used to verify deployments)
|
||||||
|
- `homelab-deploy` - NATS-based remote deployment tool for test-tier hosts
|
||||||
- Custom packages from git.t-juice.club:
|
- Custom packages from git.t-juice.club:
|
||||||
- `alerttonotify` - Alert routing
|
- `alerttonotify` - Alert routing
|
||||||
- `labmon` - Lab monitoring
|
|
||||||
|
|
||||||
### Network Architecture
|
### Network Architecture
|
||||||
|
|
||||||
@@ -197,12 +318,16 @@ Template/test hosts:
|
|||||||
|
|
||||||
### Secrets Management
|
### Secrets Management
|
||||||
|
|
||||||
- Uses SOPS with age encryption
|
Most hosts use OpenBao (Vault) for secrets:
|
||||||
- Each server has unique age key in `.sops.yaml`
|
- Vault server at `vault01.home.2rjus.net:8200`
|
||||||
- Keys auto-generated at `/var/lib/sops-nix/key.txt` on first boot
|
- AppRole authentication with credentials at `/var/lib/vault/approle/`
|
||||||
- Shared secrets: `/secrets/secrets.yaml`
|
- Secrets defined in Terraform (`terraform/vault/secrets.tf`)
|
||||||
- Per-host secrets: `/secrets/<hostname>/`
|
- AppRole policies in Terraform (`terraform/vault/approle.tf`)
|
||||||
- All production servers can decrypt shared secrets; host-specific secrets require specific host keys
|
- NixOS module: `system/vault-secrets.nix` with `vault.secrets.<name>` options
|
||||||
|
- `extractKey` option extracts a single key from vault JSON as a plain file
|
||||||
|
- Secrets fetched at boot by `vault-secret-<name>.service` systemd units
|
||||||
|
- Fallback to cached secrets in `/var/lib/vault/cache/` when Vault is unreachable
|
||||||
|
- Provision AppRole credentials: `nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=<host>`
|
||||||
|
|
||||||
### Auto-Upgrade System
|
### Auto-Upgrade System
|
||||||
|
|
||||||
@@ -264,9 +389,21 @@ Example VM deployment includes:
|
|||||||
- Custom CPU/memory/disk sizing
|
- Custom CPU/memory/disk sizing
|
||||||
- VLAN tagging
|
- VLAN tagging
|
||||||
- QEMU guest agent
|
- QEMU guest agent
|
||||||
|
- Automatic Vault credential provisioning via `vault_wrapped_token`
|
||||||
|
|
||||||
OpenTofu outputs the VM's IP address after deployment for easy SSH access.
|
OpenTofu outputs the VM's IP address after deployment for easy SSH access.
|
||||||
|
|
||||||
|
**Automatic Vault Credential Provisioning:**
|
||||||
|
|
||||||
|
VMs can receive Vault (OpenBao) credentials automatically during bootstrap:
|
||||||
|
|
||||||
|
1. OpenTofu generates a wrapped token via `terraform/vault/` and stores it in the VM configuration
|
||||||
|
2. Cloud-init passes `VAULT_WRAPPED_TOKEN` and `NIXOS_FLAKE_BRANCH` to the bootstrap script
|
||||||
|
3. The bootstrap script unwraps the token to obtain AppRole credentials
|
||||||
|
4. Credentials are written to `/var/lib/vault/approle/` before the NixOS rebuild
|
||||||
|
|
||||||
|
This eliminates the need for manual `provision-approle.yml` playbook runs on new VMs. Bootstrap progress is logged to Loki with `job="bootstrap"` labels.
|
||||||
|
|
||||||
#### Template Rebuilding and Terraform State
|
#### Template Rebuilding and Terraform State
|
||||||
|
|
||||||
When the Proxmox template is rebuilt (via `build-and-deploy-template.yml`), the template name may change. This would normally cause Terraform to want to recreate all existing VMs, but that's unnecessary since VMs are independent once cloned.
|
When the Proxmox template is rebuilt (via `build-and-deploy-template.yml`), the template name may change. This would normally cause Terraform to want to recreate all existing VMs, but that's unnecessary since VMs are independent once cloned.
|
||||||
@@ -297,20 +434,13 @@ This means:
|
|||||||
|
|
||||||
### Adding a New Host
|
### Adding a New Host
|
||||||
|
|
||||||
1. Create `/hosts/<hostname>/` directory
|
See [docs/host-creation.md](docs/host-creation.md) for the complete host creation pipeline, including:
|
||||||
2. Copy structure from `template1` or similar host
|
- Using the `create-host` script to generate host configurations
|
||||||
3. Add host entry to `flake.nix` nixosConfigurations
|
- Deploying VMs and secrets with OpenTofu
|
||||||
4. Configure networking in `configuration.nix` (static IP via `systemd.network.networks`, DNS servers)
|
- Monitoring the bootstrap process via Loki
|
||||||
5. (Optional) Add `homelab.dns.cnames` if the host needs CNAME aliases
|
- Verification and troubleshooting steps
|
||||||
6. User clones template host
|
|
||||||
7. User runs `prepare-host.sh` on new host, this deletes files which should be regenerated, like ssh host keys, machine-id etc. It also creates a new age key, and prints the public key
|
|
||||||
8. This key is then added to `.sops.yaml`
|
|
||||||
9. Create `/secrets/<hostname>/` if needed
|
|
||||||
10. Commit changes, and merge to master.
|
|
||||||
11. Deploy by running `nixos-rebuild boot --flake URL#<hostname>` on the host.
|
|
||||||
12. Run auto-upgrade on DNS servers (ns1, ns2) to pick up the new host's DNS entry
|
|
||||||
|
|
||||||
**Note:** DNS A records are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file editing is required.
|
**Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required.
|
||||||
|
|
||||||
### Important Patterns
|
### Important Patterns
|
||||||
|
|
||||||
@@ -324,6 +454,8 @@ This means:
|
|||||||
|
|
||||||
**Firewall**: Disabled on most hosts (trusted network). Enable selectively in host configuration if needed.
|
**Firewall**: Disabled on most hosts (trusted network). Enable selectively in host configuration if needed.
|
||||||
|
|
||||||
|
**Shell scripts**: Use `pkgs.writeShellApplication` instead of `pkgs.writeShellScript` or `pkgs.writeShellScriptBin` for creating shell scripts. `writeShellApplication` provides automatic shellcheck validation, sets strict bash options (`set -euo pipefail`), and allows declaring `runtimeInputs` for dependencies. When referencing the executable path (e.g., in `ExecStart`), use `lib.getExe myScript` to get the proper `bin/` path.
|
||||||
|
|
||||||
### Monitoring Stack
|
### Monitoring Stack
|
||||||
|
|
||||||
All hosts ship metrics and logs to `monitoring01`:
|
All hosts ship metrics and logs to `monitoring01`:
|
||||||
@@ -333,6 +465,19 @@ All hosts ship metrics and logs to `monitoring01`:
|
|||||||
- **Tracing**: Tempo for distributed tracing
|
- **Tracing**: Tempo for distributed tracing
|
||||||
- **Profiling**: Pyroscope for continuous profiling
|
- **Profiling**: Pyroscope for continuous profiling
|
||||||
|
|
||||||
|
**Scrape Target Auto-Generation:**
|
||||||
|
|
||||||
|
Prometheus scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
|
||||||
|
|
||||||
|
- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
|
||||||
|
- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
|
||||||
|
- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
|
||||||
|
- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
|
||||||
|
|
||||||
|
Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The Prometheus config on monitoring01 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options.
|
||||||
|
|
||||||
|
To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
|
||||||
|
|
||||||
### DNS Architecture
|
### DNS Architecture
|
||||||
|
|
||||||
- `ns1` (10.69.13.5) - Primary authoritative DNS + resolver
|
- `ns1` (10.69.13.5) - Primary authoritative DNS + resolver
|
||||||
@@ -348,13 +493,30 @@ DNS zone entries are automatically generated from host configurations:
|
|||||||
- **External hosts**: Non-flake hosts defined in `/services/ns/external-hosts.nix`
|
- **External hosts**: Non-flake hosts defined in `/services/ns/external-hosts.nix`
|
||||||
- **Serial number**: Uses `self.sourceInfo.lastModified` (git commit timestamp)
|
- **Serial number**: Uses `self.sourceInfo.lastModified` (git commit timestamp)
|
||||||
|
|
||||||
Host DNS options (`homelab.dns.*`):
|
|
||||||
- `enable` (default: `true`) - Include host in DNS zone generation
|
|
||||||
- `cnames` (default: `[]`) - List of CNAME aliases pointing to this host
|
|
||||||
|
|
||||||
Hosts are automatically excluded from DNS if:
|
Hosts are automatically excluded from DNS if:
|
||||||
- `homelab.dns.enable = false` (e.g., template hosts)
|
- `homelab.dns.enable = false` (e.g., template hosts)
|
||||||
- No static IP configured (e.g., DHCP-only hosts)
|
- No static IP configured (e.g., DHCP-only hosts)
|
||||||
- Network interface is a VPN/tunnel (wg*, tun*, tap*)
|
- Network interface is a VPN/tunnel (wg*, tun*, tap*)
|
||||||
|
|
||||||
To add DNS entries for non-NixOS hosts, edit `/services/ns/external-hosts.nix`.
|
To add DNS entries for non-NixOS hosts, edit `/services/ns/external-hosts.nix`.
|
||||||
|
|
||||||
|
### Homelab Module Options
|
||||||
|
|
||||||
|
The `modules/homelab/` directory defines custom options used across hosts for automation and metadata.
|
||||||
|
|
||||||
|
**Host options (`homelab.host.*`):**
|
||||||
|
- `tier` - Deployment tier: `test` or `prod`. Test-tier hosts can receive remote deployments and have different credential access.
|
||||||
|
- `priority` - Alerting priority: `high` or `low`. Controls alerting thresholds for the host.
|
||||||
|
- `role` - Primary role designation (e.g., `dns`, `database`, `bastion`, `vault`)
|
||||||
|
- `labels` - Free-form key-value metadata for host categorization
|
||||||
|
|
||||||
|
**DNS options (`homelab.dns.*`):**
|
||||||
|
- `enable` (default: `true`) - Include host in DNS zone generation
|
||||||
|
- `cnames` (default: `[]`) - List of CNAME aliases pointing to this host
|
||||||
|
|
||||||
|
**Monitoring options (`homelab.monitoring.*`):**
|
||||||
|
- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets
|
||||||
|
- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host
|
||||||
|
|
||||||
|
**Deploy options (`homelab.deploy.*`):**
|
||||||
|
- `enable` (default: `false`) - Enable NATS-based remote deployment listener. When enabled, the host listens for deployment commands via NATS and can be targeted by the `homelab-deploy` MCP server.
|
||||||
|
|||||||
@@ -7,18 +7,14 @@ NixOS Flake-based configuration repository for a homelab infrastructure. All hos
|
|||||||
| Host | Role |
|
| Host | Role |
|
||||||
|------|------|
|
|------|------|
|
||||||
| `ns1`, `ns2` | Primary/secondary authoritative DNS |
|
| `ns1`, `ns2` | Primary/secondary authoritative DNS |
|
||||||
| `ns3`, `ns4` | Additional DNS servers |
|
|
||||||
| `ca` | Internal Certificate Authority |
|
| `ca` | Internal Certificate Authority |
|
||||||
| `ha1` | Home Assistant + Zigbee2MQTT + Mosquitto |
|
| `ha1` | Home Assistant + Zigbee2MQTT + Mosquitto |
|
||||||
| `http-proxy` | Reverse proxy |
|
| `http-proxy` | Reverse proxy |
|
||||||
| `monitoring01` | Prometheus, Grafana, Loki, Tempo, Pyroscope |
|
| `monitoring01` | Prometheus, Grafana, Loki, Tempo, Pyroscope |
|
||||||
| `jelly01` | Jellyfin media server |
|
| `jelly01` | Jellyfin media server |
|
||||||
| `nix-cache01` | Nix binary cache |
|
| `nix-cache01` | Nix binary cache |
|
||||||
| `pgdb1` | PostgreSQL |
|
|
||||||
| `nats1` | NATS messaging |
|
| `nats1` | NATS messaging |
|
||||||
| `auth01` | Authentication (LLDAP + Authelia) |
|
|
||||||
| `vault01` | OpenBao (Vault) secrets management |
|
| `vault01` | OpenBao (Vault) secrets management |
|
||||||
| `media1` | Media services |
|
|
||||||
| `template1`, `template2` | VM templates for cloning new hosts |
|
| `template1`, `template2` | VM templates for cloning new hosts |
|
||||||
|
|
||||||
## Directory Structure
|
## Directory Structure
|
||||||
@@ -30,7 +26,7 @@ system/ # Shared modules applied to ALL hosts
|
|||||||
services/ # Reusable service modules, selectively imported per host
|
services/ # Reusable service modules, selectively imported per host
|
||||||
modules/ # Custom NixOS module definitions
|
modules/ # Custom NixOS module definitions
|
||||||
lib/ # Nix library functions (DNS zone generation, etc.)
|
lib/ # Nix library functions (DNS zone generation, etc.)
|
||||||
secrets/ # SOPS-encrypted secrets (age encryption)
|
secrets/ # SOPS-encrypted secrets (legacy, only used by ca)
|
||||||
common/ # Shared configurations (e.g., VM guest agent)
|
common/ # Shared configurations (e.g., VM guest agent)
|
||||||
terraform/ # OpenTofu configs for Proxmox VM provisioning
|
terraform/ # OpenTofu configs for Proxmox VM provisioning
|
||||||
terraform/vault/ # OpenTofu configs for OpenBao (secrets, PKI, AppRoles)
|
terraform/vault/ # OpenTofu configs for OpenBao (secrets, PKI, AppRoles)
|
||||||
@@ -42,7 +38,7 @@ scripts/ # Helper scripts (create-host, vault-fetch)
|
|||||||
|
|
||||||
**Automatic DNS zone generation** - A records are derived from each host's static IP configuration. CNAME aliases are defined via `homelab.dns.cnames`. No manual zone file editing required.
|
**Automatic DNS zone generation** - A records are derived from each host's static IP configuration. CNAME aliases are defined via `homelab.dns.cnames`. No manual zone file editing required.
|
||||||
|
|
||||||
**SOPS secrets management** - Each host has a unique age key. Shared secrets live in `secrets/secrets.yaml`, per-host secrets in `secrets/<hostname>/`.
|
**OpenBao (Vault) secrets** - Hosts authenticate via AppRole and fetch secrets at boot. Secrets and policies are managed as code in `terraform/vault/`. Legacy SOPS remains only for the `ca` host.
|
||||||
|
|
||||||
**Daily auto-upgrades** - All hosts pull from the master branch and automatically rebuild and reboot on a randomized schedule.
|
**Daily auto-upgrades** - All hosts pull from the master branch and automatically rebuild and reboot on a randomized schedule.
|
||||||
|
|
||||||
|
|||||||
21
common/ssh-audit.nix
Normal file
21
common/ssh-audit.nix
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# SSH session command auditing
|
||||||
|
#
|
||||||
|
# Logs all commands executed by users who logged in interactively (SSH).
|
||||||
|
# System services and nix builds are excluded via auid filter.
|
||||||
|
#
|
||||||
|
# Logs are sent to journald and forwarded to Loki via promtail.
|
||||||
|
# Query with: {host="<hostname>"} |= "EXECVE"
|
||||||
|
{
|
||||||
|
# Enable Linux audit subsystem
|
||||||
|
security.audit.enable = true;
|
||||||
|
security.auditd.enable = true;
|
||||||
|
|
||||||
|
# Log execve syscalls only from interactive login sessions
|
||||||
|
# auid!=4294967295 means "audit login uid is set" (excludes system services, nix builds)
|
||||||
|
security.audit.rules = [
|
||||||
|
"-a exit,always -F arch=b64 -S execve -F auid!=4294967295"
|
||||||
|
];
|
||||||
|
|
||||||
|
# Forward audit logs to journald (so promtail ships them to Loki)
|
||||||
|
services.journald.audit = true;
|
||||||
|
}
|
||||||
217
docs/host-creation.md
Normal file
217
docs/host-creation.md
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
# Host Creation Pipeline
|
||||||
|
|
||||||
|
This document describes the process for creating new hosts in the homelab infrastructure.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
We use the `create-host` script to create new hosts, which generates default configurations from a template. We then use OpenTofu to deploy both secrets and VMs. The VMs boot using a template image (built from `hosts/template2`), which starts a bootstrap process. This bootstrap process applies the host's NixOS configuration and then reboots into the new config.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
All tools are available in the devshell: `create-host`, `bao` (OpenBao CLI), `tofu`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
Steps marked with **USER** must be performed by the user due to credential requirements.
|
||||||
|
|
||||||
|
1. **USER**: Run `create-host --hostname <name> --ip <ip/prefix>`
|
||||||
|
2. Edit the auto-generated configurations in `hosts/<hostname>/` to import whatever modules are needed for its purpose
|
||||||
|
3. Add any secrets needed to `terraform/vault/`
|
||||||
|
4. Edit the VM specs in `terraform/vms.tf` if needed. To deploy from a branch other than master, add `flake_branch = "<branch>"` to the VM definition
|
||||||
|
5. Push configuration to master (or the branch specified by `flake_branch`)
|
||||||
|
6. **USER**: Apply terraform:
|
||||||
|
```bash
|
||||||
|
nix develop -c tofu -chdir=terraform/vault apply
|
||||||
|
nix develop -c tofu -chdir=terraform apply
|
||||||
|
```
|
||||||
|
7. Once terraform completes, a VM boots in Proxmox using the template image
|
||||||
|
8. The VM runs the `nixos-bootstrap` service, which applies the host config and reboots
|
||||||
|
9. After reboot, the host should be operational
|
||||||
|
10. Trigger auto-upgrade on `ns1` and `ns2` to propagate DNS records for the new host
|
||||||
|
11. Trigger auto-upgrade on `monitoring01` to add the host to Prometheus scrape targets
|
||||||
|
|
||||||
|
## Tier Specification
|
||||||
|
|
||||||
|
New hosts should set `homelab.host.tier` in their configuration:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host.tier = "test"; # or "prod"
|
||||||
|
```
|
||||||
|
|
||||||
|
- **test** - Test-tier hosts can receive remote deployments via the `homelab-deploy` MCP server and have different credential access. Use for staging/testing.
|
||||||
|
- **prod** - Production hosts. Deployments require direct access or the CLI with appropriate credentials.
|
||||||
|
|
||||||
|
## Observability
|
||||||
|
|
||||||
|
During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with:
|
||||||
|
|
||||||
|
```
|
||||||
|
{job="bootstrap", host="<hostname>"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Bootstrap Stages
|
||||||
|
|
||||||
|
The bootstrap process reports these stages via the `stage` label:
|
||||||
|
|
||||||
|
| Stage | Message | Meaning |
|
||||||
|
|-------|---------|---------|
|
||||||
|
| `starting` | Bootstrap starting for \<host\> (branch: \<branch\>) | Bootstrap service has started |
|
||||||
|
| `network_ok` | Network connectivity confirmed | Can reach git server |
|
||||||
|
| `vault_ok` | Vault credentials unwrapped and stored | AppRole credentials provisioned |
|
||||||
|
| `vault_skip` | No Vault token provided - skipping credential setup | No wrapped token was provided |
|
||||||
|
| `vault_warn` | Failed to unwrap Vault token - continuing without secrets | Token unwrap failed (expired/used) |
|
||||||
|
| `building` | Starting nixos-rebuild boot | NixOS build starting |
|
||||||
|
| `success` | Build successful - rebooting into new configuration | Build complete, rebooting |
|
||||||
|
| `failed` | nixos-rebuild failed - manual intervention required | Build failed |
|
||||||
|
|
||||||
|
### Useful Queries
|
||||||
|
|
||||||
|
```
|
||||||
|
# All bootstrap activity for a host
|
||||||
|
{job="bootstrap", host="myhost"}
|
||||||
|
|
||||||
|
# Track all failures
|
||||||
|
{job="bootstrap", stage="failed"}
|
||||||
|
|
||||||
|
# Monitor builds in progress
|
||||||
|
{job="bootstrap", stage=~"building|success"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Once the VM reboots with its full configuration, it will start publishing metrics to Prometheus and logs to Loki via Promtail.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
1. Check bootstrap completed successfully:
|
||||||
|
```
|
||||||
|
{job="bootstrap", host="<hostname>", stage="success"}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Verify the host is up and reporting metrics:
|
||||||
|
```promql
|
||||||
|
up{instance=~"<hostname>.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Verify the correct flake revision is deployed:
|
||||||
|
```promql
|
||||||
|
nixos_flake_info{instance=~"<hostname>.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Check logs are flowing:
|
||||||
|
```
|
||||||
|
{host="<hostname>"}
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Confirm expected services are running and producing logs
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Bootstrap Failed
|
||||||
|
|
||||||
|
#### Common Issues
|
||||||
|
|
||||||
|
* VM has trouble running the initial nixos-rebuild. Usually caused by having to compile packages from scratch because they are not available in our local nix-cache.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources:
|
||||||
|
```
|
||||||
|
{job="bootstrap", host="<hostname>"}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **USER**: SSH into the host and check the bootstrap service:
|
||||||
|
```bash
|
||||||
|
ssh root@<hostname>
|
||||||
|
journalctl -u nixos-bootstrap.service
|
||||||
|
```
|
||||||
|
|
||||||
|
3. If the build failed due to resource constraints, increase VM specs in `terraform/vms.tf` and redeploy, or manually run the rebuild:
|
||||||
|
```bash
|
||||||
|
nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git#<hostname>
|
||||||
|
```
|
||||||
|
|
||||||
|
4. If the host config doesn't exist in the flake, ensure step 5 was completed (config pushed to the correct branch).
|
||||||
|
|
||||||
|
### Vault Credentials Not Working
|
||||||
|
|
||||||
|
Usually caused by running the `create-host` script without proper credentials, or by a wrapped token that has expired or already been used.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Check if credentials exist on the host:
|
||||||
|
```bash
|
||||||
|
ssh root@<hostname>
|
||||||
|
ls -la /var/lib/vault/approle/
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check bootstrap logs for vault-related stages:
|
||||||
|
```
|
||||||
|
{job="bootstrap", host="<hostname>", stage=~"vault.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **USER**: Regenerate and provision credentials manually:
|
||||||
|
```bash
|
||||||
|
nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=<hostname>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Host Not Appearing in DNS
|
||||||
|
|
||||||
|
Usually caused by the commit that adds the new host not yet being deployed to ns1/ns2.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Verify the host config has a static IP configured in `systemd.network.networks`
|
||||||
|
|
||||||
|
2. Check that `homelab.dns.enable` is not set to `false`
|
||||||
|
|
||||||
|
3. **USER**: Trigger auto-upgrade on DNS servers:
|
||||||
|
```bash
|
||||||
|
ssh root@ns1 systemctl start nixos-upgrade.service
|
||||||
|
ssh root@ns2 systemctl start nixos-upgrade.service
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Verify DNS resolution after upgrade completes:
|
||||||
|
```bash
|
||||||
|
dig @ns1.home.2rjus.net <hostname>.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
### Host Not Being Scraped by Prometheus
|
||||||
|
|
||||||
|
Usually caused by not having deployed the commit with the new host to the monitoring host.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Check that `homelab.monitoring.enable` is not set to `false`
|
||||||
|
|
||||||
|
2. **USER**: Trigger auto-upgrade on monitoring01:
|
||||||
|
```bash
|
||||||
|
ssh root@monitoring01 systemctl start nixos-upgrade.service
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Verify the target appears in Prometheus:
|
||||||
|
```promql
|
||||||
|
up{instance=~"<hostname>.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
4. If the target is down, check that node-exporter is running on the host:
|
||||||
|
```bash
|
||||||
|
ssh root@<hostname> systemctl status prometheus-node-exporter.service
|
||||||
|
```
|
||||||
|
|
||||||
|
## Related Files
|
||||||
|
|
||||||
|
| Path | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `scripts/create-host/` | The `create-host` script that generates host configurations |
|
||||||
|
| `hosts/template2/` | Template VM configuration (base image for new VMs) |
|
||||||
|
| `hosts/template2/bootstrap.nix` | Bootstrap service that applies NixOS config on first boot |
|
||||||
|
| `terraform/vms.tf` | VM definitions (specs, IPs, branch overrides) |
|
||||||
|
| `terraform/cloud-init.tf` | Cloud-init configuration (passes hostname, branch, vault token) |
|
||||||
|
| `terraform/vault/approle.tf` | AppRole policies for each host |
|
||||||
|
| `terraform/vault/secrets.tf` | Secret definitions in Vault |
|
||||||
|
| `terraform/vault/hosts-generated.tf` | Auto-generated wrapped tokens for VM bootstrap |
|
||||||
|
| `playbooks/provision-approle.yml` | Ansible playbook for manual credential provisioning |
|
||||||
|
| `flake.nix` | Flake with all host configurations (add new hosts here) |
|
||||||
164
docs/plans/auth-system-replacement.md
Normal file
164
docs/plans/auth-system-replacement.md
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
# Authentication System Replacement Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Deploy a modern, unified authentication solution for the homelab. Provides central user management, SSO for web services, and consistent UID/GID mapping for NAS permissions.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **Central user database** - Manage users across all homelab hosts from a single source
|
||||||
|
2. **Linux PAM/NSS integration** - Users can SSH into hosts using central credentials
|
||||||
|
3. **UID/GID consistency** - Proper POSIX attributes for NAS share permissions
|
||||||
|
4. **OIDC provider** - Single sign-on for homelab web services (Grafana, etc.)
|
||||||
|
|
||||||
|
## Solution: Kanidm
|
||||||
|
|
||||||
|
Kanidm was chosen for the following reasons:
|
||||||
|
|
||||||
|
| Requirement | Kanidm Support |
|
||||||
|
|-------------|----------------|
|
||||||
|
| Central user database | Native |
|
||||||
|
| Linux PAM/NSS (host login) | Native NixOS module |
|
||||||
|
| UID/GID for NAS | POSIX attributes supported |
|
||||||
|
| OIDC for services | Built-in |
|
||||||
|
| Declarative config | Excellent NixOS provisioning |
|
||||||
|
| Simplicity | Modern API, LDAP optional |
|
||||||
|
| NixOS integration | First-class |
|
||||||
|
|
||||||
|
### Configuration Files
|
||||||
|
|
||||||
|
- **Host configuration:** `hosts/kanidm01/`
|
||||||
|
- **Service module:** `services/kanidm/default.nix`
|
||||||
|
|
||||||
|
## NAS Integration
|
||||||
|
|
||||||
|
### Current: TrueNAS CORE (FreeBSD)
|
||||||
|
|
||||||
|
TrueNAS CORE has a built-in LDAP client. Kanidm's read-only LDAP interface will work for NFS share permissions:
|
||||||
|
|
||||||
|
- **NFS shares**: Only need consistent UID/GID mapping - Kanidm's LDAP provides this
|
||||||
|
- **No SMB requirement**: SMB would need Samba schema attributes (deprecated in TrueNAS 13.0+), but we're NFS-only
|
||||||
|
|
||||||
|
Configuration approach:
|
||||||
|
1. Enable Kanidm's LDAP interface (`ldapbindaddress = "0.0.0.0:636"`)
|
||||||
|
2. Import internal CA certificate into TrueNAS
|
||||||
|
3. Configure TrueNAS LDAP client with Kanidm's Base DN and bind credentials
|
||||||
|
4. Users/groups appear in TrueNAS permission dropdowns
|
||||||
|
|
||||||
|
Note: Kanidm's LDAP is read-only and uses LDAPS only (no StartTLS). This is fine for our use case.
|
||||||
|
|
||||||
|
### Future: NixOS NAS
|
||||||
|
|
||||||
|
When the NAS is migrated to NixOS, it becomes a first-class citizen:
|
||||||
|
|
||||||
|
- Native Kanidm PAM/NSS integration (same as other hosts)
|
||||||
|
- No LDAP compatibility layer needed
|
||||||
|
- Full integration with the rest of the homelab
|
||||||
|
|
||||||
|
This future migration path is a strong argument for Kanidm over LDAP-only solutions.
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
1. **Create kanidm01 host and service module** ✅
|
||||||
|
- Host: `kanidm01.home.2rjus.net` (10.69.13.23, test tier)
|
||||||
|
- Service module: `services/kanidm/`
|
||||||
|
- TLS via internal ACME (`auth.home.2rjus.net`)
|
||||||
|
- Vault integration for idm_admin password
|
||||||
|
- LDAPS on port 636
|
||||||
|
|
||||||
|
2. **Configure provisioning** ✅
|
||||||
|
- Groups provisioned declaratively: `admins`, `users`, `ssh-users`
|
||||||
|
- Users managed imperatively via CLI (allows setting POSIX passwords in one step)
|
||||||
|
- POSIX attributes enabled (UID/GID range 65,536-69,999)
|
||||||
|
|
||||||
|
3. **Test NAS integration** (in progress)
|
||||||
|
- ✅ LDAP interface verified working
|
||||||
|
- Configure TrueNAS LDAP client to connect to Kanidm
|
||||||
|
- Verify UID/GID mapping works with NFS shares
|
||||||
|
|
||||||
|
4. **Add OIDC clients** for homelab services
|
||||||
|
- Grafana
|
||||||
|
- Other services as needed
|
||||||
|
|
||||||
|
5. **Create client module** in `system/` for PAM/NSS ✅
|
||||||
|
- Module: `system/kanidm-client.nix`
|
||||||
|
- `homelab.kanidm.enable = true` enables PAM/NSS
|
||||||
|
- Short usernames (not SPN format)
|
||||||
|
- Home directory symlinks via `home_alias`
|
||||||
|
- Enabled on test tier: testvm01, testvm02, testvm03
|
||||||
|
|
||||||
|
6. **Documentation** ✅
|
||||||
|
- `docs/user-management.md` - CLI workflows, troubleshooting
|
||||||
|
- User/group creation procedures verified working
|
||||||
|
|
||||||
|
## Progress
|
||||||
|
|
||||||
|
### Completed (2026-02-08)
|
||||||
|
|
||||||
|
**Kanidm server deployed on kanidm01 (test tier):**
|
||||||
|
- Host: `kanidm01.home.2rjus.net` (10.69.13.23)
|
||||||
|
- WebUI: `https://auth.home.2rjus.net`
|
||||||
|
- LDAPS: port 636
|
||||||
|
- Valid certificate from internal CA
|
||||||
|
|
||||||
|
**Configuration:**
|
||||||
|
- Kanidm 1.8 with secret provisioning support
|
||||||
|
- Daily backups at 22:00 (7 versions retained)
|
||||||
|
- Vault integration for idm_admin password
|
||||||
|
- Prometheus monitoring scrape target configured
|
||||||
|
|
||||||
|
**Provisioned entities:**
|
||||||
|
- Groups: `admins`, `users`, `ssh-users` (declarative)
|
||||||
|
- Users managed via CLI (imperative)
|
||||||
|
|
||||||
|
**Verified working:**
|
||||||
|
- WebUI login with idm_admin
|
||||||
|
- LDAP bind and search with POSIX-enabled user
|
||||||
|
- LDAPS with valid internal CA certificate
|
||||||
|
|
||||||
|
### Completed (2026-02-08) - PAM/NSS Client
|
||||||
|
|
||||||
|
**Client module deployed (`system/kanidm-client.nix`):**
|
||||||
|
- `homelab.kanidm.enable = true` enables PAM/NSS integration
|
||||||
|
- Connects to auth.home.2rjus.net
|
||||||
|
- Short usernames (`torjus` instead of `torjus@home.2rjus.net`)
|
||||||
|
- Home directory symlinks (`/home/torjus` → UUID-based dir)
|
||||||
|
- Login restricted to `ssh-users` group
|
||||||
|
|
||||||
|
**Enabled on test tier:**
|
||||||
|
- testvm01, testvm02, testvm03
|
||||||
|
|
||||||
|
**Verified working:**
|
||||||
|
- User/group resolution via `getent`
|
||||||
|
- SSH login with Kanidm unix passwords
|
||||||
|
- Home directory creation with symlinks
|
||||||
|
- Imperative user/group creation via CLI
|
||||||
|
|
||||||
|
**Documentation:**
|
||||||
|
- `docs/user-management.md` with full CLI workflows
|
||||||
|
- Password requirements (min 10 chars)
|
||||||
|
- Troubleshooting guide (nscd, cache invalidation)
|
||||||
|
|
||||||
|
### UID/GID Range (Resolved)
|
||||||
|
|
||||||
|
**Range: 65,536 - 69,999** (manually allocated)
|
||||||
|
|
||||||
|
- Users: 65,536 - 67,999 (up to ~2500 users)
|
||||||
|
- Groups: 68,000 - 69,999 (up to ~2000 groups)
|
||||||
|
|
||||||
|
Rationale:
|
||||||
|
- Starts at Kanidm's recommended minimum (65,536)
|
||||||
|
- Well above NixOS system users (typically <1000)
|
||||||
|
- Avoids Podman/container issues with very high GIDs
|
||||||
|
|
||||||
|
### Next Steps
|
||||||
|
|
||||||
|
1. Enable PAM/NSS on production hosts (after test tier validation)
|
||||||
|
2. Configure TrueNAS LDAP client for NAS integration testing
|
||||||
|
3. Add OAuth2 clients (Grafana first)
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [Kanidm Documentation](https://kanidm.github.io/kanidm/stable/)
|
||||||
|
- [NixOS Kanidm Module](https://search.nixos.org/options?query=services.kanidm)
|
||||||
|
- [Kanidm PAM/NSS Integration](https://kanidm.github.io/kanidm/stable/pam_and_nsswitch.html)
|
||||||
72
docs/plans/cert-monitoring.md
Normal file
72
docs/plans/cert-monitoring.md
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# Certificate Monitoring Plan
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This document describes the removal of labmon certificate monitoring and outlines future needs for certificate monitoring in the homelab.
|
||||||
|
|
||||||
|
## What Was Removed
|
||||||
|
|
||||||
|
### labmon Service
|
||||||
|
|
||||||
|
The `labmon` service was a custom Go application that provided:
|
||||||
|
|
||||||
|
1. **StepMonitor**: Monitoring for step-ca (Smallstep CA) certificate provisioning and health
|
||||||
|
2. **TLSConnectionMonitor**: Periodic TLS connection checks to verify certificate validity and expiration
|
||||||
|
|
||||||
|
The service exposed Prometheus metrics at `:9969` including:
|
||||||
|
- `labmon_tlsconmon_certificate_seconds_left` - Time until certificate expiration
|
||||||
|
- `labmon_tlsconmon_certificate_check_error` - Whether the TLS check failed
|
||||||
|
- `labmon_stepmon_certificate_seconds_left` - Step-CA internal certificate expiration
|
||||||
|
|
||||||
|
### Affected Files
|
||||||
|
|
||||||
|
- `hosts/monitoring01/configuration.nix` - Removed labmon configuration block
|
||||||
|
- `services/monitoring/prometheus.nix` - Removed labmon scrape target
|
||||||
|
- `services/monitoring/rules.yml` - Removed `certificate_rules` alert group
|
||||||
|
- `services/monitoring/alloy.nix` - Deleted (was only used for labmon profiling)
|
||||||
|
- `services/monitoring/default.nix` - Removed alloy.nix import
|
||||||
|
|
||||||
|
### Removed Alerts
|
||||||
|
|
||||||
|
- `certificate_expiring_soon` - Warned when any monitored TLS cert had < 24h validity
|
||||||
|
- `step_ca_serving_cert_expiring` - Critical alert for step-ca's own serving certificate
|
||||||
|
- `certificate_check_error` - Warned when TLS connection check failed
|
||||||
|
- `step_ca_certificate_expiring` - Critical alert for step-ca issued certificates
|
||||||
|
|
||||||
|
## Why It Was Removed
|
||||||
|
|
||||||
|
1. **step-ca decommissioned**: The primary monitoring target (step-ca) is no longer in use
|
||||||
|
2. **Outdated codebase**: labmon was a custom tool that required maintenance
|
||||||
|
3. **Limited value**: With ACME auto-renewal, certificates should renew automatically
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
ACME certificates are now issued by OpenBao PKI at `vault.home.2rjus.net:8200`. The ACME protocol handles automatic renewal, and certificates are typically renewed well before expiration.
|
||||||
|
|
||||||
|
## Future Needs
|
||||||
|
|
||||||
|
While ACME handles renewal automatically, we should consider monitoring for:
|
||||||
|
|
||||||
|
1. **ACME renewal failures**: Alert when a certificate fails to renew
|
||||||
|
- Could monitor ACME client logs (via Loki queries)
|
||||||
|
- Could check certificate file modification times
|
||||||
|
|
||||||
|
2. **Certificate expiration as backup**: Even with auto-renewal, a last-resort alert for certificates approaching expiration would catch renewal failures
|
||||||
|
|
||||||
|
3. **Certificate transparency**: Monitor for unexpected certificate issuance
|
||||||
|
|
||||||
|
### Potential Solutions
|
||||||
|
|
||||||
|
1. **Prometheus blackbox_exporter**: Can probe TLS endpoints and export certificate expiration metrics
|
||||||
|
- `probe_ssl_earliest_cert_expiry` metric
|
||||||
|
- Already a standard tool, well-maintained
|
||||||
|
|
||||||
|
2. **Custom Loki alerting**: Query ACME service logs for renewal failures
|
||||||
|
- Works with existing infrastructure
|
||||||
|
- No additional services needed
|
||||||
|
|
||||||
|
3. **Node-exporter textfile collector**: Script that checks local certificate files and writes expiration metrics
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
**Not yet implemented.** This document serves as a placeholder for future work on certificate monitoring.
|
||||||
35
docs/plans/completed/bootstrap-cache.md
Normal file
35
docs/plans/completed/bootstrap-cache.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Plan: Configure Template2 to Use Nix Cache
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
New VMs bootstrapped from template2 don't use our local nix cache (nix-cache.home.2rjus.net) during the initial `nixos-rebuild boot`. This means the first build downloads everything from cache.nixos.org, which is slower and uses more bandwidth.
|
||||||
|
|
||||||
|
## Solution
|
||||||
|
|
||||||
|
Update the template2 base image to include the nix cache configuration, so new VMs immediately benefit from cached builds during bootstrap.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
1. Add nix cache configuration to `hosts/template2/configuration.nix`:
|
||||||
|
```nix
|
||||||
|
nix.settings = {
|
||||||
|
substituters = [ "https://nix-cache.home.2rjus.net" "https://cache.nixos.org" ];
|
||||||
|
trusted-public-keys = [
|
||||||
|
"nix-cache.home.2rjus.net:..." # Add the cache's public key
|
||||||
|
"cache.nixos.org-1:..."
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Rebuild and redeploy the Proxmox template:
|
||||||
|
```bash
|
||||||
|
nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Update `default_template_name` in `terraform/variables.tf` if the template name changed
|
||||||
|
|
||||||
|
## Benefits
|
||||||
|
|
||||||
|
- Faster VM bootstrap times
|
||||||
|
- Reduced bandwidth to external cache
|
||||||
|
- Most derivations will already be cached from other hosts
|
||||||
23
docs/plans/completed/host-cleanup.md
Normal file
23
docs/plans/completed/host-cleanup.md
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# Host Cleanup
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Remove decommissioned/unused host configurations that are no longer reachable on the network.
|
||||||
|
|
||||||
|
## Hosts to review
|
||||||
|
|
||||||
|
The following hosts return "no route to host" from Prometheus scraping and are likely no longer needed:
|
||||||
|
|
||||||
|
- `media1` (10.69.12.82)
|
||||||
|
- `ns3` (10.69.13.7)
|
||||||
|
- `ns4` (10.69.13.8)
|
||||||
|
- `nixos-test1` (10.69.13.10)
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
1. Confirm each host is truly decommissioned (not just temporarily powered off)
|
||||||
|
2. Remove host directory from `hosts/`
|
||||||
|
3. Remove `nixosConfigurations` entry from `flake.nix`
|
||||||
|
4. Remove host's age key from `.sops.yaml`
|
||||||
|
5. Remove per-host secrets from `secrets/<hostname>/` if any
|
||||||
|
6. Verify DNS zone and Prometheus targets no longer include the removed hosts after rebuild
|
||||||
128
docs/plans/completed/monitoring-gaps.md
Normal file
128
docs/plans/completed/monitoring-gaps.md
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
# Monitoring Gaps Audit
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Audit of services running in the homelab that lack monitoring coverage, either missing Prometheus scrape targets, alerting rules, or both.
|
||||||
|
|
||||||
|
## Services with No Monitoring
|
||||||
|
|
||||||
|
### PostgreSQL (`pgdb1`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** A database outage would go completely unnoticed by Prometheus
|
||||||
|
- **Recommendation:** Enable `services.prometheus.exporters.postgres` (available in nixpkgs). This exposes connection counts, query throughput, replication lag, table/index stats, and more. Add alerts for at least `postgres_down` (systemd unit state) and connection pool exhaustion.
|
||||||
|
|
||||||
|
### Authelia (`auth01`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** The authentication gateway being down blocks access to all proxied services
|
||||||
|
- **Recommendation:** Authelia exposes Prometheus metrics natively at `/metrics`. Add a scrape target and at minimum an `authelia_down` systemd unit state alert.
|
||||||
|
|
||||||
|
### LLDAP (`auth01`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** LLDAP is a dependency of Authelia -- if LDAP is down, authentication breaks even if Authelia is running
|
||||||
|
- **Recommendation:** Add an `lldap_down` systemd unit state alert. LLDAP does not expose Prometheus metrics natively, so systemd unit monitoring via node-exporter may be sufficient.
|
||||||
|
|
||||||
|
### Vault / OpenBao (`vault01`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** Secrets management service failures go undetected
|
||||||
|
- **Recommendation:** OpenBao supports Prometheus telemetry output natively. Add a scrape target for the telemetry endpoint and alerts for `vault_down` (systemd unit) and seal status.
|
||||||
|
|
||||||
|
### Gitea Actions Runner
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** CI/CD failures go undetected
|
||||||
|
- **Recommendation:** Add at minimum a systemd unit state alert. The runner itself has limited metrics exposure.
|
||||||
|
|
||||||
|
## Services with Partial Monitoring
|
||||||
|
|
||||||
|
### Jellyfin (`jelly01`)
|
||||||
|
|
||||||
|
- **Current state:** Has scrape targets (port 8096), metrics are being collected, but zero alert rules
|
||||||
|
- **Metrics available:** 184 metrics, all .NET runtime / ASP.NET Core level. No Jellyfin-specific metrics (active streams, library size, transcoding sessions). Key useful metrics:
|
||||||
|
- `microsoft_aspnetcore_hosting_failed_requests` - rate of HTTP errors
|
||||||
|
- `microsoft_aspnetcore_hosting_current_requests` - in-flight requests
|
||||||
|
- `process_working_set_bytes` - memory usage (~256 MB currently)
|
||||||
|
- `dotnet_gc_pause_ratio` - GC pressure
|
||||||
|
- `up{job="jellyfin"}` - basic availability
|
||||||
|
- **Recommendation:** Add a `jellyfin_down` alert using either `up{job="jellyfin"} == 0` or systemd unit state. Consider alerting on sustained `failed_requests` rate increase.
|
||||||
|
|
||||||
|
### NATS (`nats1`)
|
||||||
|
|
||||||
|
- **Current state:** Has a `nats_down` alert (systemd unit state via node-exporter), but no NATS-specific metrics
|
||||||
|
- **Metrics available:** NATS has a built-in `/metrics` endpoint exposing connection counts, message throughput, JetStream consumer lag, and more
|
||||||
|
- **Recommendation:** Add a scrape target for the NATS metrics endpoint. Consider alerts for connection count spikes, slow consumers, and JetStream storage usage.
|
||||||
|
|
||||||
|
### DNS - Unbound (`ns1`, `ns2`)
|
||||||
|
|
||||||
|
- **Current state:** Has `unbound_down` alert (systemd unit state), but no DNS query metrics
|
||||||
|
- **Available in nixpkgs:** `services.prometheus.exporters.unbound.enable` (package: `prometheus-unbound-exporter` v0.5.0). Exposes query counts, cache hit ratios, response types (SERVFAIL, NXDOMAIN), upstream latency.
|
||||||
|
- **Recommendation:** Enable the unbound exporter on ns1/ns2. Add alerts for cache hit ratio drops and SERVFAIL rate spikes.
|
||||||
|
|
||||||
|
### DNS - NSD (`ns1`, `ns2`)
|
||||||
|
|
||||||
|
- **Current state:** Has `nsd_down` alert (systemd unit state), no NSD-specific metrics
|
||||||
|
- **Available in nixpkgs:** Nothing. No exporter package or NixOS module. Community `nsd_exporter` exists but is not packaged.
|
||||||
|
- **Recommendation:** The existing systemd unit alert is likely sufficient. NSD is a simple authoritative-only server with limited operational metrics. Not worth packaging a custom exporter for now.
|
||||||
|
|
||||||
|
## Existing Monitoring (for reference)
|
||||||
|
|
||||||
|
These services have adequate alerting and/or scrape targets:
|
||||||
|
|
||||||
|
| Service | Scrape Targets | Alert Rules |
|
||||||
|
|---|---|---|
|
||||||
|
| Monitoring stack (Prometheus, Grafana, Loki, Tempo, Pyroscope) | Yes | 7 alerts |
|
||||||
|
| Home Assistant (+ Zigbee2MQTT, Mosquitto) | Yes (port 8123) | 3 alerts |
|
||||||
|
| HTTP Proxy (Caddy) | Yes (port 80) | 3 alerts |
|
||||||
|
| Nix Cache (Harmonia, build-flakes) | Via Caddy | 4 alerts |
|
||||||
|
| CA (step-ca) | Yes (port 9000) | 4 certificate alerts |
|
||||||
|
|
||||||
|
## Per-Service Resource Metrics (systemd-exporter)
|
||||||
|
|
||||||
|
### Current State
|
||||||
|
|
||||||
|
No per-service CPU, memory, or IO metrics are collected. The existing node-exporter systemd collector only provides unit state (active/inactive/failed), socket stats, and timer triggers. While systemd tracks per-unit resource usage via cgroups internally (visible in `systemctl status` and `systemd-cgtop`), this data is not exported to Prometheus.
|
||||||
|
|
||||||
|
### Available Solution
|
||||||
|
|
||||||
|
The `prometheus-systemd-exporter` package (v0.7.0) is available in nixpkgs with a ready-made NixOS module:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.prometheus.exporters.systemd.enable = true;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Options:** `enable`, `port`, `extraFlags`, `user`, `group`
|
||||||
|
|
||||||
|
This exporter reads cgroup data and exposes per-unit metrics including:
|
||||||
|
- CPU seconds consumed per service
|
||||||
|
- Memory usage per service
|
||||||
|
- Task/process counts per service
|
||||||
|
- Restart counts
|
||||||
|
- IO usage
|
||||||
|
|
||||||
|
### Recommendation
|
||||||
|
|
||||||
|
Enable on all hosts via the shared `system/` config (same pattern as node-exporter). Add a corresponding scrape job on monitoring01. This would give visibility into resource consumption per service across the fleet, useful for capacity planning and diagnosing noisy-neighbor issues on shared hosts.
|
||||||
|
|
||||||
|
## Suggested Priority
|
||||||
|
|
||||||
|
1. **PostgreSQL** - Critical infrastructure, easy to add with existing nixpkgs module
|
||||||
|
2. **Authelia + LLDAP** - Auth outage affects all proxied services
|
||||||
|
3. **Unbound exporter** - Ready-to-go NixOS module, just needs enabling
|
||||||
|
4. **Jellyfin alerts** - Metrics already collected, just needs alert rules
|
||||||
|
5. **NATS metrics** - Built-in endpoint, just needs a scrape target
|
||||||
|
6. **Vault/OpenBao** - Native telemetry support
|
||||||
|
7. **Actions Runner** - Lower priority, basic systemd alert sufficient
|
||||||
|
|
||||||
|
## Node-Exporter Targets Currently Down
|
||||||
|
|
||||||
|
Noted during audit -- these node-exporter targets are failing:
|
||||||
|
|
||||||
|
- `nixos-test1.home.2rjus.net:9100` - no route to host
|
||||||
|
- `media1.home.2rjus.net:9100` - no route to host
|
||||||
|
- `ns3.home.2rjus.net:9100` - no route to host
|
||||||
|
- `ns4.home.2rjus.net:9100` - no route to host
|
||||||
|
|
||||||
|
These may be decommissioned or powered-off hosts that should be removed from the scrape config.
|
||||||
371
docs/plans/completed/nats-deploy-service.md
Normal file
371
docs/plans/completed/nats-deploy-service.md
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
# NATS-Based Deployment Service
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Create a message-based deployment system that allows triggering NixOS configuration updates on-demand, rather than waiting for the daily auto-upgrade timer. This enables faster iteration when testing changes and immediate fleet-wide deployments.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **On-demand deployment** - Trigger config updates immediately via NATS message
|
||||||
|
2. **Targeted deployment** - Deploy to specific hosts or all hosts
|
||||||
|
3. **Branch/revision support** - Test feature branches before merging to master
|
||||||
|
4. **MCP integration** - Allow Claude Code to trigger deployments during development
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- **Auto-upgrade**: All hosts run `nixos-upgrade.service` daily, pulling from master
|
||||||
|
- **Manual testing**: `nixos-rebuild-test <action> <branch>` helper exists on all hosts
|
||||||
|
- **NATS**: Running on nats1 with JetStream enabled, using NKey authentication
|
||||||
|
- **Accounts**: ADMIN (system) and HOMELAB (user workloads with JetStream)
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────┐ ┌─────────────┐
|
||||||
|
│ MCP Tool │ deploy.test.> │ Admin CLI │ deploy.test.> + deploy.prod.>
|
||||||
|
│ (claude) │────────────┐ ┌─────│ (torjus) │
|
||||||
|
└─────────────┘ │ │ └─────────────┘
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ nats1 │
|
||||||
|
│ (authz) │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
┌─────────────────┼─────────────────┐
|
||||||
|
│ │ │
|
||||||
|
▼ ▼ ▼
|
||||||
|
┌──────────┐ ┌──────────┐ ┌──────────┐
|
||||||
|
│ template1│ │ ns1 │ │ ha1 │
|
||||||
|
│ tier=test│ │ tier=prod│ │ tier=prod│
|
||||||
|
└──────────┘ └──────────┘ └──────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Repository Structure
|
||||||
|
|
||||||
|
The project lives in a **separate repository** (e.g., `homelab-deploy`) containing:
|
||||||
|
|
||||||
|
```
|
||||||
|
homelab-deploy/
|
||||||
|
├── flake.nix # Nix flake with Go package + NixOS module
|
||||||
|
├── go.mod
|
||||||
|
├── go.sum
|
||||||
|
├── cmd/
|
||||||
|
│ └── homelab-deploy/
|
||||||
|
│ └── main.go # CLI entrypoint with subcommands
|
||||||
|
├── internal/
|
||||||
|
│ ├── listener/ # Listener mode logic
|
||||||
|
│ ├── mcp/ # MCP server mode logic
|
||||||
|
│ └── deploy/ # Shared deployment logic
|
||||||
|
└── nixos/
|
||||||
|
└── module.nix # NixOS module for listener service
|
||||||
|
```
|
||||||
|
|
||||||
|
The main nixos-servers repository imports this flake as an input and uses the NixOS module.
|
||||||
|
|
||||||
|
## Single Binary with Subcommands
|
||||||
|
|
||||||
|
The `homelab-deploy` binary supports multiple modes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run as listener on a host (systemd service)
|
||||||
|
homelab-deploy listener --hostname ns1 --nats-url nats://nats1:4222
|
||||||
|
|
||||||
|
# Run as MCP server (for Claude Code)
|
||||||
|
homelab-deploy mcp --nats-url nats://nats1:4222
|
||||||
|
|
||||||
|
# CLI commands for manual use
|
||||||
|
homelab-deploy deploy ns1 --branch feature-x --action switch # single host
|
||||||
|
homelab-deploy deploy --tier test --all --action boot # all test hosts
|
||||||
|
homelab-deploy deploy --tier prod --all --action boot # all prod hosts (admin only)
|
||||||
|
homelab-deploy deploy --tier prod --role dns --action switch # all prod dns hosts
|
||||||
|
homelab-deploy status
|
||||||
|
```
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### Listener Mode
|
||||||
|
|
||||||
|
A systemd service on each host that:
|
||||||
|
- Subscribes to multiple subjects for targeted and group deployments
|
||||||
|
- Validates incoming messages (revision, action)
|
||||||
|
- Executes `nixos-rebuild` with specified parameters
|
||||||
|
- Reports status back via NATS
|
||||||
|
|
||||||
|
**Subject structure:**
|
||||||
|
```
|
||||||
|
deploy.<tier>.<hostname> # specific host (e.g., deploy.prod.ns1)
|
||||||
|
deploy.<tier>.all # all hosts in tier (e.g., deploy.test.all)
|
||||||
|
deploy.<tier>.role.<role> # all hosts with role in tier (e.g., deploy.prod.role.dns)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Listener subscriptions** (based on `homelab.host` config):
|
||||||
|
- `deploy.<tier>.<hostname>` - direct messages to this host
|
||||||
|
- `deploy.<tier>.all` - broadcast to all hosts in tier
|
||||||
|
- `deploy.<tier>.role.<role>` - broadcast to hosts with matching role (if role is set)
|
||||||
|
|
||||||
|
Example: ns1 with `tier=prod, role=dns` subscribes to:
|
||||||
|
- `deploy.prod.ns1`
|
||||||
|
- `deploy.prod.all`
|
||||||
|
- `deploy.prod.role.dns`
|
||||||
|
|
||||||
|
**NixOS module configuration:**
|
||||||
|
```nix
|
||||||
|
services.homelab-deploy.listener = {
|
||||||
|
enable = true;
|
||||||
|
timeout = 600; # seconds, default 10 minutes
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
The listener reads tier and role from `config.homelab.host` (see Host Metadata below).
|
||||||
|
|
||||||
|
**Request message format:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"action": "switch" | "boot" | "test" | "dry-activate",
|
||||||
|
"revision": "master" | "feature-branch" | "abc123...",
|
||||||
|
"reply_to": "deploy.responses.<request-id>"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response message format:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "accepted" | "rejected" | "started" | "completed" | "failed",
|
||||||
|
"error": "invalid_revision" | "already_running" | "build_failed" | null,
|
||||||
|
"message": "human-readable details"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Request/Reply flow:**
|
||||||
|
1. MCP/CLI sends deploy request with unique `reply_to` subject
|
||||||
|
2. Listener validates request (e.g., `git ls-remote` to check revision exists)
|
||||||
|
3. Listener sends immediate response:
|
||||||
|
- `{"status": "rejected", "error": "invalid_revision", "message": "branch 'foo' not found"}`, or
|
||||||
|
- `{"status": "started", "message": "starting nixos-rebuild switch"}`
|
||||||
|
4. If started, listener runs nixos-rebuild
|
||||||
|
5. Listener sends final response:
|
||||||
|
- `{"status": "completed", "message": "successfully switched to generation 42"}`, or
|
||||||
|
- `{"status": "failed", "error": "build_failed", "message": "nixos-rebuild exited with code 1"}`
|
||||||
|
|
||||||
|
This provides immediate feedback on validation errors (bad revision, already running) without waiting for the build to fail.
|
||||||
|
|
||||||
|
### MCP Mode
|
||||||
|
|
||||||
|
Runs as an MCP server providing tools for Claude Code.
|
||||||
|
|
||||||
|
**Tools:**
|
||||||
|
| Tool | Description | Tier Access |
|
||||||
|
|------|-------------|-------------|
|
||||||
|
| `deploy` | Deploy to test hosts (individual, all, or by role) | test only |
|
||||||
|
| `deploy_admin` | Deploy to any host (requires `--enable-admin` flag) | test + prod |
|
||||||
|
| `deploy_status` | Check deployment status/history | n/a |
|
||||||
|
| `list_hosts` | List available deployment targets | n/a |
|
||||||
|
|
||||||
|
**CLI flags:**
|
||||||
|
```bash
|
||||||
|
# Default: only test-tier deployments available
|
||||||
|
homelab-deploy mcp --nats-url nats://nats1:4222
|
||||||
|
|
||||||
|
# Enable admin tool (requires admin NKey to be configured)
|
||||||
|
homelab-deploy mcp --nats-url nats://nats1:4222 --enable-admin --admin-nkey-file /path/to/admin.nkey
|
||||||
|
```
|
||||||
|
|
||||||
|
**Security layers:**
|
||||||
|
1. **MCP flag**: `deploy_admin` tool only exposed when `--enable-admin` is passed
|
||||||
|
2. **NATS authz**: Even if tool is exposed, NATS rejects publishes without valid admin NKey
|
||||||
|
3. **Claude Code permissions**: Can set `mcp__homelab-deploy__deploy_admin` to `ask` mode for confirmation popup
|
||||||
|
|
||||||
|
By default, the MCP only loads test-tier credentials and exposes the `deploy` tool. Claude can:
|
||||||
|
- Deploy to individual test hosts
|
||||||
|
- Deploy to all test hosts at once (`deploy.test.all`)
|
||||||
|
- Deploy to test hosts by role (`deploy.test.role.<role>`)
|
||||||
|
|
||||||
|
### Tiered Permissions
|
||||||
|
|
||||||
|
Authorization is enforced at the NATS layer using subject-based permissions. Different deployer credentials have different publish rights:
|
||||||
|
|
||||||
|
**NATS user configuration (on nats1):**
|
||||||
|
```nix
|
||||||
|
accounts = {
|
||||||
|
HOMELAB = {
|
||||||
|
users = [
|
||||||
|
# MCP/Claude - test tier only
|
||||||
|
{
|
||||||
|
nkey = "UABC..."; # mcp-deployer
|
||||||
|
permissions = {
|
||||||
|
publish = [ "deploy.test.>" ];
|
||||||
|
subscribe = [ "deploy.responses.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Admin - full access to all tiers
|
||||||
|
{
|
||||||
|
nkey = "UXYZ..."; # admin-deployer
|
||||||
|
permissions = {
|
||||||
|
publish = [ "deploy.test.>" "deploy.prod.>" ];
|
||||||
|
subscribe = [ "deploy.responses.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Host listeners - subscribe to their tier, publish responses
|
||||||
|
{
|
||||||
|
nkey = "UDEF..."; # host-listener (one per host)
|
||||||
|
permissions = {
|
||||||
|
subscribe = [ "deploy.*.>" ];
|
||||||
|
publish = [ "deploy.responses.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Host tier assignments** (via `homelab.host.tier`):
|
||||||
|
| Tier | Hosts |
|
||||||
|
|------|-------|
|
||||||
|
| test | template1, nix-cache01, future test hosts |
|
||||||
|
| prod | ns1, ns2, ha1, monitoring01, http-proxy, etc. |
|
||||||
|
|
||||||
|
**Example deployment scenarios:**
|
||||||
|
|
||||||
|
| Command | Subject | MCP | Admin |
|
||||||
|
|---------|---------|-----|-------|
|
||||||
|
| Deploy to ns1 | `deploy.prod.ns1` | ❌ | ✅ |
|
||||||
|
| Deploy to template1 | `deploy.test.template1` | ✅ | ✅ |
|
||||||
|
| Deploy to all test hosts | `deploy.test.all` | ✅ | ✅ |
|
||||||
|
| Deploy to all prod hosts | `deploy.prod.all` | ❌ | ✅ |
|
||||||
|
| Deploy to all DNS servers | `deploy.prod.role.dns` | ❌ | ✅ |
|
||||||
|
|
||||||
|
All NKeys stored in Vault - MCP gets limited credentials, admin CLI gets full-access credentials.
|
||||||
|
|
||||||
|
### Host Metadata
|
||||||
|
|
||||||
|
Rather than defining `tier` in the listener config, use a central `homelab.host` module that provides host metadata for multiple consumers. This aligns with the approach proposed in `docs/plans/prometheus-scrape-target-labels.md`.
|
||||||
|
|
||||||
|
**Status:** The `homelab.host` module is implemented in `modules/homelab/host.nix`.
|
||||||
|
Hosts can be filtered by tier using `config.homelab.host.tier`.
|
||||||
|
|
||||||
|
**Module definition (in `modules/homelab/host.nix`):**
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "test" "prod" ];
|
||||||
|
default = "prod";
|
||||||
|
description = "Deployment tier - controls which credentials can deploy to this host";
|
||||||
|
};
|
||||||
|
|
||||||
|
priority = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "high" "low" ];
|
||||||
|
default = "high";
|
||||||
|
description = "Alerting priority - low priority hosts have relaxed thresholds";
|
||||||
|
};
|
||||||
|
|
||||||
|
role = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.str;
|
||||||
|
default = null;
|
||||||
|
description = "Primary role of this host (dns, database, monitoring, etc.)";
|
||||||
|
};
|
||||||
|
|
||||||
|
labels = lib.mkOption {
|
||||||
|
type = lib.types.attrsOf lib.types.str;
|
||||||
|
default = { };
|
||||||
|
description = "Additional free-form labels";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Consumers:**
|
||||||
|
- `homelab-deploy` listener reads `config.homelab.host.tier` for subject subscription
|
||||||
|
- Prometheus scrape config reads `priority`, `role`, `labels` for target labels
|
||||||
|
- Future services can consume the same metadata
|
||||||
|
|
||||||
|
**Example host config:**
|
||||||
|
```nix
|
||||||
|
# hosts/nix-cache01/configuration.nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = "test"; # can be deployed by MCP
|
||||||
|
priority = "low"; # relaxed alerting thresholds
|
||||||
|
role = "build-host";
|
||||||
|
};
|
||||||
|
|
||||||
|
# hosts/ns1/configuration.nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod"; # requires admin credentials
|
||||||
|
priority = "high";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
### Phase 1: Core Binary + Listener
|
||||||
|
|
||||||
|
1. **Create homelab-deploy repository**
|
||||||
|
- Initialize Go module
|
||||||
|
- Set up flake.nix with Go package build
|
||||||
|
|
||||||
|
2. **Implement listener mode**
|
||||||
|
- NATS subscription logic
|
||||||
|
- nixos-rebuild execution
|
||||||
|
- Status reporting via NATS reply
|
||||||
|
|
||||||
|
3. **Create NixOS module**
|
||||||
|
- Systemd service definition
|
||||||
|
- Configuration options (hostname, NATS URL, NKey path)
|
||||||
|
- Vault secret integration for NKeys
|
||||||
|
|
||||||
|
4. **Create `homelab.host` module** (in nixos-servers)
|
||||||
|
- Define `tier`, `priority`, `role`, `labels` options
|
||||||
|
- This module is shared with Prometheus label work (see `docs/plans/prometheus-scrape-target-labels.md`)
|
||||||
|
|
||||||
|
5. **Integrate with nixos-servers**
|
||||||
|
- Add flake input for homelab-deploy
|
||||||
|
- Import listener module in `system/`
|
||||||
|
- Set `homelab.host.tier` per host (test vs prod)
|
||||||
|
|
||||||
|
6. **Configure NATS tiered permissions**
|
||||||
|
- Add deployer users to nats1 config (mcp-deployer, admin-deployer)
|
||||||
|
- Set up subject ACLs per user (test-only vs full access)
|
||||||
|
- Add deployer NKeys to Vault
|
||||||
|
- Create Terraform resources for NKey secrets
|
||||||
|
|
||||||
|
### Phase 2: MCP + CLI
|
||||||
|
|
||||||
|
7. **Implement MCP mode**
|
||||||
|
- MCP server with deploy/status tools
|
||||||
|
- Request/reply pattern for deployment feedback
|
||||||
|
|
||||||
|
8. **Implement CLI commands**
|
||||||
|
- `deploy` command for manual deployments
|
||||||
|
- `status` command to check deployment state
|
||||||
|
|
||||||
|
9. **Configure Claude Code**
|
||||||
|
- Add MCP server to configuration
|
||||||
|
- Document usage
|
||||||
|
|
||||||
|
### Phase 3: Enhancements
|
||||||
|
|
||||||
|
10. Add deployment locking (prevent concurrent deploys)
|
||||||
|
11. Prometheus metrics for deployment status
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
- **Privilege escalation**: Listener runs as root to execute nixos-rebuild
|
||||||
|
- **Input validation**: Strictly validate revision format (branch name or commit hash)
|
||||||
|
- **Rate limiting**: Prevent rapid-fire deployments
|
||||||
|
- **Audit logging**: Log all deployment requests with source identity
|
||||||
|
- **Network isolation**: NATS only accessible from internal network
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
All open questions have been resolved. See Notes section for decision rationale.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The existing `nixos-rebuild-test` helper provides a good reference for the rebuild logic
|
||||||
|
- Uses NATS request/reply pattern for immediate validation feedback and completion status
|
||||||
|
- Consider using NATS headers for metadata (request ID, timestamp)
|
||||||
|
- **Timeout decision**: Metrics show no-change upgrades complete in 5-55 seconds. A 10-minute default provides ample headroom for actual updates with package downloads. Per-host override available for hosts with known longer build times.
|
||||||
|
- **Rollback**: Not needed as a separate feature - deploy an older commit hash to effectively roll back.
|
||||||
|
- **Offline hosts**: No message persistence - if host is offline, deploy fails. Daily auto-upgrade is the safety net. Avoids complexity of JetStream deduplication (host coming online and applying 10 queued updates instead of just the latest).
|
||||||
|
- **Deploy history**: Use existing Loki - listener logs deployments to journald, queryable via Loki. No need for separate JetStream persistence.
|
||||||
|
- **Naming**: `homelab-deploy` - ties it to the infrastructure rather than implementation details.
|
||||||
176
docs/plans/completed/nixos-exporter.md
Normal file
176
docs/plans/completed/nixos-exporter.md
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
# NixOS Prometheus Exporter
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Build a generic Prometheus exporter for NixOS-specific metrics. This exporter should be useful for any NixOS deployment, not just our homelab.
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Provide visibility into NixOS system state that standard exporters don't cover:
|
||||||
|
- Generation management (count, age, current vs booted)
|
||||||
|
- Flake input freshness
|
||||||
|
- Upgrade status
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
### Core Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `nixos_generation_count` | Number of system generations | Count entries in `/nix/var/nix/profiles/system-*` |
|
||||||
|
| `nixos_current_generation` | Active generation number | Parse `readlink /run/current-system` |
|
||||||
|
| `nixos_booted_generation` | Generation that was booted | Parse `/run/booted-system` |
|
||||||
|
| `nixos_generation_age_seconds` | Age of current generation | File mtime of current system profile |
|
||||||
|
| `nixos_config_mismatch` | 1 if booted != current, 0 otherwise | Compare symlink targets |
|
||||||
|
|
||||||
|
### Flake Metrics (optional collector)
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `nixos_flake_input_age_seconds` | Age of each flake.lock input | Parse `lastModified` from flake.lock |
|
||||||
|
| `nixos_flake_input_info` | Info gauge with rev label | Parse `rev` from flake.lock |
|
||||||
|
|
||||||
|
Labels: `input` (e.g., "nixpkgs", "home-manager")
|
||||||
|
|
||||||
|
### Future Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `nixos_upgrade_pending` | 1 if remote differs from local | Compare flake refs (expensive) |
|
||||||
|
| `nixos_store_size_bytes` | Size of /nix/store | `du` or filesystem stats |
|
||||||
|
| `nixos_store_path_count` | Number of store paths | Count entries |
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
Single binary with optional collectors enabled via config or flags.
|
||||||
|
|
||||||
|
```
|
||||||
|
nixos-exporter
|
||||||
|
├── main.go
|
||||||
|
├── collector/
|
||||||
|
│ ├── generation.go # Core generation metrics
|
||||||
|
│ └── flake.go # Flake input metrics
|
||||||
|
└── config/
|
||||||
|
└── config.go
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
listen_addr: ":9971"
|
||||||
|
collectors:
|
||||||
|
generation:
|
||||||
|
enabled: true
|
||||||
|
flake:
|
||||||
|
enabled: false
|
||||||
|
lock_path: "/etc/nixos/flake.lock" # or auto-detect from /run/current-system
|
||||||
|
```
|
||||||
|
|
||||||
|
Command-line alternative:
|
||||||
|
```bash
|
||||||
|
nixos-exporter --listen=:9971 --collector.flake --flake.lock-path=/etc/nixos/flake.lock
|
||||||
|
```
|
||||||
|
|
||||||
|
## NixOS Module
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.prometheus.exporters.nixos = {
|
||||||
|
enable = true;
|
||||||
|
port = 9971;
|
||||||
|
collectors = [ "generation" "flake" ];
|
||||||
|
flake.lockPath = "/etc/nixos/flake.lock";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
The module should integrate with nixpkgs' existing `services.prometheus.exporters.*` pattern.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Language
|
||||||
|
|
||||||
|
Go - mature prometheus client library, single static binary, easy cross-compilation.
|
||||||
|
|
||||||
|
### Phase 1: Core
|
||||||
|
1. Create git repository
|
||||||
|
2. Implement generation collector (count, current, booted, age, mismatch)
|
||||||
|
3. Basic HTTP server with `/metrics` endpoint
|
||||||
|
4. NixOS module
|
||||||
|
|
||||||
|
### Phase 2: Flake Collector
|
||||||
|
1. Parse flake.lock JSON format
|
||||||
|
2. Extract lastModified timestamps per input
|
||||||
|
3. Add input labels
|
||||||
|
|
||||||
|
### Phase 3: Packaging
|
||||||
|
1. Add to nixpkgs or publish as flake
|
||||||
|
2. Documentation
|
||||||
|
3. Example Grafana dashboard
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
|
||||||
|
```
|
||||||
|
# HELP nixos_generation_count Total number of system generations
|
||||||
|
# TYPE nixos_generation_count gauge
|
||||||
|
nixos_generation_count 47
|
||||||
|
|
||||||
|
# HELP nixos_current_generation Currently active generation number
|
||||||
|
# TYPE nixos_current_generation gauge
|
||||||
|
nixos_current_generation 47
|
||||||
|
|
||||||
|
# HELP nixos_booted_generation Generation that was booted
|
||||||
|
# TYPE nixos_booted_generation gauge
|
||||||
|
nixos_booted_generation 46
|
||||||
|
|
||||||
|
# HELP nixos_generation_age_seconds Age of current generation in seconds
|
||||||
|
# TYPE nixos_generation_age_seconds gauge
|
||||||
|
nixos_generation_age_seconds 3600
|
||||||
|
|
||||||
|
# HELP nixos_config_mismatch 1 if booted generation differs from current
|
||||||
|
# TYPE nixos_config_mismatch gauge
|
||||||
|
nixos_config_mismatch 1
|
||||||
|
|
||||||
|
# HELP nixos_flake_input_age_seconds Age of flake input in seconds
|
||||||
|
# TYPE nixos_flake_input_age_seconds gauge
|
||||||
|
nixos_flake_input_age_seconds{input="nixpkgs"} 259200
|
||||||
|
nixos_flake_input_age_seconds{input="home-manager"} 86400
|
||||||
|
```
|
||||||
|
|
||||||
|
## Alert Examples
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- alert: NixOSConfigStale
|
||||||
|
expr: nixos_generation_age_seconds > 7 * 24 * 3600
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "NixOS config on {{ $labels.instance }} is over 7 days old"
|
||||||
|
|
||||||
|
- alert: NixOSRebootRequired
|
||||||
|
expr: nixos_config_mismatch == 1
|
||||||
|
for: 24h
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: "{{ $labels.instance }} needs reboot to apply config"
|
||||||
|
|
||||||
|
- alert: NixpkgsInputStale
|
||||||
|
expr: nixos_flake_input_age_seconds{input="nixpkgs"} > 30 * 24 * 3600
|
||||||
|
for: 1d
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: "nixpkgs input on {{ $labels.instance }} is over 30 days old"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] How to detect flake.lock path automatically? (check /run/current-system for flake info)
|
||||||
|
- [ ] Should generation collector need root? (probably not, just reading symlinks)
|
||||||
|
- [ ] Include in nixpkgs or distribute as standalone flake?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Port 9971 suggested (9970 reserved for homelab-exporter)
|
||||||
|
- Keep scope focused on NixOS-specific metrics - don't duplicate node-exporter
|
||||||
|
- Consider submitting to prometheus exporter registry once stable
|
||||||
107
docs/plans/completed/ns1-recreation.md
Normal file
107
docs/plans/completed/ns1-recreation.md
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
# ns1 Recreation Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Recreate ns1 using the OpenTofu workflow after the existing VM entered emergency mode due to incorrect hardware-configuration.nix (hardcoded UUIDs that don't match actual disk layout).
|
||||||
|
|
||||||
|
## Current ns1 Configuration to Preserve
|
||||||
|
|
||||||
|
- **IP:** 10.69.13.5/24
|
||||||
|
- **Gateway:** 10.69.13.1
|
||||||
|
- **Role:** Primary DNS (authoritative + resolver)
|
||||||
|
- **Services:**
|
||||||
|
- `../../services/ns/master-authorative.nix`
|
||||||
|
- `../../services/ns/resolver.nix`
|
||||||
|
- **Metadata:**
|
||||||
|
- `homelab.host.role = "dns"`
|
||||||
|
- `homelab.host.labels.dns_role = "primary"`
|
||||||
|
- **Vault:** enabled
|
||||||
|
- **Deploy:** enabled
|
||||||
|
|
||||||
|
## Execution Steps
|
||||||
|
|
||||||
|
### Phase 1: Remove Old Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop -c create-host --remove --hostname ns1 --force
|
||||||
|
```
|
||||||
|
|
||||||
|
This removes:
|
||||||
|
- `hosts/ns1/` directory
|
||||||
|
- Entry from `flake.nix`
|
||||||
|
- Any terraform entries (none exist currently)
|
||||||
|
|
||||||
|
### Phase 2: Create New Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop -c create-host --hostname ns1 --ip 10.69.13.5/24
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates:
|
||||||
|
- `hosts/ns1/` with template2-based configuration
|
||||||
|
- Entry in `flake.nix`
|
||||||
|
- Entry in `terraform/vms.tf`
|
||||||
|
- Vault wrapped token for bootstrap
|
||||||
|
|
||||||
|
### Phase 3: Customize Configuration
|
||||||
|
|
||||||
|
After create-host, manually update `hosts/ns1/configuration.nix` to add:
|
||||||
|
|
||||||
|
1. DNS service imports:
|
||||||
|
```nix
|
||||||
|
../../services/ns/master-authorative.nix
|
||||||
|
../../services/ns/resolver.nix
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Host metadata:
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Disable resolved (conflicts with Unbound):
|
||||||
|
```nix
|
||||||
|
services.resolved.enable = false;
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 4: Commit Changes
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add -A
|
||||||
|
git commit -m "ns1: recreate with OpenTofu workflow
|
||||||
|
|
||||||
|
Old VM had incorrect hardware-configuration.nix with hardcoded UUIDs
|
||||||
|
that didn't match actual disk layout, causing boot failure.
|
||||||
|
|
||||||
|
Recreated using template2-based configuration for OpenTofu provisioning."
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 5: Infrastructure
|
||||||
|
|
||||||
|
1. Delete old ns1 VM in Proxmox (it's broken anyway)
|
||||||
|
2. Run `nix develop -c tofu -chdir=terraform apply`
|
||||||
|
3. Wait for bootstrap to complete
|
||||||
|
4. Verify ns1 is functional:
|
||||||
|
- DNS resolution working
|
||||||
|
- Zone transfer to ns2 working
|
||||||
|
- All exporters responding
|
||||||
|
|
||||||
|
### Phase 6: Finalize
|
||||||
|
|
||||||
|
- Push to master
|
||||||
|
- Move this plan to `docs/plans/completed/`
|
||||||
|
|
||||||
|
## Rollback
|
||||||
|
|
||||||
|
If the new VM fails:
|
||||||
|
1. ns2 is still operational as secondary DNS
|
||||||
|
2. Can recreate with different settings if needed
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- ns2 will continue serving DNS during the migration
|
||||||
|
- Zone data is generated from flake, so no data loss
|
||||||
|
- The old VM's disk can be kept briefly in Proxmox as backup if desired
|
||||||
205
docs/plans/completed/prometheus-scrape-target-labels.md
Normal file
205
docs/plans/completed/prometheus-scrape-target-labels.md
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
# Prometheus Scrape Target Labels
|
||||||
|
|
||||||
|
## Implementation Status
|
||||||
|
|
||||||
|
| Step | Status | Notes |
|
||||||
|
|------|--------|-------|
|
||||||
|
| 1. Create `homelab.host` module | ✅ Complete | `modules/homelab/host.nix` |
|
||||||
|
| 2. Update `lib/monitoring.nix` | ✅ Complete | Labels extracted and propagated |
|
||||||
|
| 3. Update Prometheus config | ✅ Complete | Uses structured static_configs |
|
||||||
|
| 4. Set metadata on hosts | ✅ Complete | All relevant hosts configured |
|
||||||
|
| 5. Update alert rules | ✅ Complete | Role-based filtering implemented |
|
||||||
|
| 6. Labels for service targets | ✅ Complete | Host labels propagated to all services |
|
||||||
|
| 7. Add hostname label | ✅ Complete | All targets have `hostname` label for easy filtering |
|
||||||
|
|
||||||
|
**Hosts with metadata configured:**
|
||||||
|
- `ns1`, `ns2`: `role = "dns"`, `labels.dns_role = "primary"/"secondary"`
|
||||||
|
- `nix-cache01`: `role = "build-host"`
|
||||||
|
- `vault01`: `role = "vault"`
|
||||||
|
- `testvm01/02/03`: `tier = "test"`
|
||||||
|
|
||||||
|
**Implementation complete.** Branch: `prometheus-scrape-target-labels`
|
||||||
|
|
||||||
|
**Query examples:**
|
||||||
|
- `{hostname="ns1"}` - all metrics from ns1 (any job/port)
|
||||||
|
- `node_cpu_seconds_total{hostname="monitoring01"}` - specific metric by hostname
|
||||||
|
- `up{role="dns"}` - all DNS servers
|
||||||
|
- `up{tier="test"}` - all test-tier hosts
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Add support for custom per-host labels on Prometheus scrape targets, enabling alert rules to reference host metadata (priority, role) instead of hardcoding instance names.
|
||||||
|
|
||||||
|
**Related:** This plan shares the `homelab.host` module with `docs/plans/completed/nats-deploy-service.md`, which uses the same metadata for deployment tier assignment.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Some hosts have workloads that make generic alert thresholds inappropriate. For example, `nix-cache01` regularly hits high CPU during builds, requiring a longer `for` duration on `high_cpu_load`. Currently this is handled by excluding specific instance names in PromQL expressions, which is brittle and doesn't scale.
|
||||||
|
|
||||||
|
With per-host labels, alert rules can use semantic filters like `{priority!="low"}` instead of `{instance!="nix-cache01.home.2rjus.net:9100"}`.
|
||||||
|
|
||||||
|
## Proposed Labels
|
||||||
|
|
||||||
|
### `priority`
|
||||||
|
|
||||||
|
Indicates alerting importance. Hosts with `priority = "low"` can have relaxed thresholds or longer durations in alert rules.
|
||||||
|
|
||||||
|
Values: `"high"` (default), `"low"`
|
||||||
|
|
||||||
|
### `role`
|
||||||
|
|
||||||
|
Describes the function of the host. Useful for grouping in dashboards and targeting role-specific alert rules.
|
||||||
|
|
||||||
|
Values: free-form string, e.g. `"dns"`, `"build-host"`, `"database"`, `"monitoring"`
|
||||||
|
|
||||||
|
**Note on multiple roles:** Prometheus labels are strictly string values, not lists. For hosts that serve multiple roles there are a few options:
|
||||||
|
|
||||||
|
- **Separate boolean labels:** `role_build_host = "true"`, `role_cache_server = "true"` -- flexible but verbose, and requires updating the module when new roles are added.
|
||||||
|
- **Delimited string:** `role = "build-host,cache-server"` -- works with regex matchers (`{role=~".*build-host.*"}`), but regex matching is less clean and more error-prone.
|
||||||
|
- **Pick a primary role:** `role = "build-host"` -- simplest, and probably sufficient since most hosts have one primary role.
|
||||||
|
|
||||||
|
Recommendation: start with a single primary role string. If multi-role matching becomes a real need, switch to separate boolean labels.
|
||||||
|
|
||||||
|
### `dns_role`
|
||||||
|
|
||||||
|
For DNS servers specifically, distinguish between primary and secondary resolvers. The secondary resolver (ns2) receives very little traffic and has a cold cache, making generic cache hit ratio alerts inappropriate.
|
||||||
|
|
||||||
|
Values: `"primary"`, `"secondary"`
|
||||||
|
|
||||||
|
Example use case: The `unbound_low_cache_hit_ratio` alert fires on ns2 because its cache hit ratio (~62%) is lower than ns1 (~90%). This is expected behavior since ns2 gets ~100x less traffic. With a `dns_role` label, the alert can either exclude secondaries or use different thresholds:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Only alert on primary DNS
|
||||||
|
unbound_cache_hit_ratio < 0.7 and on(instance) unbound_up{dns_role="primary"}
|
||||||
|
|
||||||
|
# Or use different thresholds
|
||||||
|
(unbound_cache_hit_ratio < 0.7 and on(instance) unbound_up{dns_role="primary"})
|
||||||
|
or
|
||||||
|
(unbound_cache_hit_ratio < 0.5 and on(instance) unbound_up{dns_role="secondary"})
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
This implementation uses a shared `homelab.host` module that provides host metadata for multiple consumers (Prometheus labels, deployment tiers, etc.). See also `docs/plans/completed/nats-deploy-service.md` which uses the same module for deployment tier assignment.
|
||||||
|
|
||||||
|
### 1. Create `homelab.host` module
|
||||||
|
|
||||||
|
✅ **Complete.** The module is in `modules/homelab/host.nix`.
|
||||||
|
|
||||||
|
Create `modules/homelab/host.nix` with shared host metadata options:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{ lib, ... }:
|
||||||
|
{
|
||||||
|
options.homelab.host = {
|
||||||
|
tier = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "test" "prod" ];
|
||||||
|
default = "prod";
|
||||||
|
description = "Deployment tier - controls which credentials can deploy to this host";
|
||||||
|
};
|
||||||
|
|
||||||
|
priority = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "high" "low" ];
|
||||||
|
default = "high";
|
||||||
|
description = "Alerting priority - low priority hosts have relaxed thresholds";
|
||||||
|
};
|
||||||
|
|
||||||
|
role = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.str;
|
||||||
|
default = null;
|
||||||
|
description = "Primary role of this host (dns, database, monitoring, etc.)";
|
||||||
|
};
|
||||||
|
|
||||||
|
labels = lib.mkOption {
|
||||||
|
type = lib.types.attrsOf lib.types.str;
|
||||||
|
default = { };
|
||||||
|
description = "Additional free-form labels (e.g., dns_role = 'primary')";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Import this module in `modules/homelab/default.nix`.
|
||||||
|
|
||||||
|
### 2. Update `lib/monitoring.nix`
|
||||||
|
|
||||||
|
✅ **Complete.** Labels are now extracted and propagated.
|
||||||
|
|
||||||
|
- `extractHostMonitoring` should also extract `homelab.host` values (priority, role, labels).
|
||||||
|
- Build the combined label set from `homelab.host`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Combine structured options + free-form labels
|
||||||
|
effectiveLabels =
|
||||||
|
(lib.optionalAttrs (host.priority != "high") { priority = host.priority; })
|
||||||
|
// (lib.optionalAttrs (host.role != null) { role = host.role; })
|
||||||
|
// host.labels;
|
||||||
|
```
|
||||||
|
|
||||||
|
- `generateNodeExporterTargets` returns structured `static_configs` entries, grouping targets by their label sets:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Before (flat list):
|
||||||
|
["ns1.home.2rjus.net:9100", "ns2.home.2rjus.net:9100", ...]
|
||||||
|
|
||||||
|
# After (grouped by labels):
|
||||||
|
[
|
||||||
|
{ targets = ["ns1.home.2rjus.net:9100", "ns2.home.2rjus.net:9100", ...]; }
|
||||||
|
{ targets = ["nix-cache01.home.2rjus.net:9100"]; labels = { priority = "low"; role = "build-host"; }; }
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
This requires grouping hosts by their label attrset and producing one `static_configs` entry per unique label combination. Hosts with default values (priority=high, no role, no labels) get grouped together with no extra labels (preserving current behavior).
|
||||||
|
|
||||||
|
### 3. Update `services/monitoring/prometheus.nix`
|
||||||
|
|
||||||
|
✅ **Complete.** Now uses structured static_configs output.
|
||||||
|
|
||||||
|
Change the node-exporter scrape config to use the new structured output:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Before:
|
||||||
|
static_configs = [{ targets = nodeExporterTargets; }];
|
||||||
|
|
||||||
|
# After:
|
||||||
|
static_configs = nodeExporterTargets;
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Set metadata on hosts
|
||||||
|
|
||||||
|
✅ **Complete.** All relevant hosts have metadata configured. Note: The implementation filters by `role` rather than `priority`, which matches the existing nix-cache01 configuration.
|
||||||
|
|
||||||
|
Example in `hosts/nix-cache01/configuration.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
priority = "low"; # relaxed alerting thresholds
|
||||||
|
role = "build-host";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** Current implementation only sets `role = "build-host"`. Consider adding `priority = "low"` when label propagation is implemented.
|
||||||
|
|
||||||
|
Example in `hosts/ns1/configuration.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** `tier` and `priority` use defaults ("prod" and "high"), which is the intended behavior. The current ns1/ns2 configurations match this pattern.
|
||||||
|
|
||||||
|
### 5. Update alert rules
|
||||||
|
|
||||||
|
✅ **Complete.** Updated `services/monitoring/rules.yml`:
|
||||||
|
|
||||||
|
- `high_cpu_load`: Replaced `instance!="nix-cache01..."` with `role!="build-host"` for standard hosts (15m duration) and `role="build-host"` for build hosts (2h duration).
|
||||||
|
- `unbound_low_cache_hit_ratio`: Added `dns_role="primary"` filter to only alert on the primary DNS resolver (secondary has a cold cache).
|
||||||
|
|
||||||
|
### 6. Labels for `generateScrapeConfigs` (service targets)
|
||||||
|
|
||||||
|
✅ **Complete.** Host labels are now propagated to all auto-generated service scrape targets (unbound, homelab-deploy, nixos-exporter, etc.). This enables semantic filtering on any service metric, such as using `dns_role="primary"` with the unbound job.
|
||||||
86
docs/plans/completed/sops-to-openbao-migration.md
Normal file
86
docs/plans/completed/sops-to-openbao-migration.md
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
# Sops to OpenBao Secrets Migration Plan
|
||||||
|
|
||||||
|
## Status: Complete (except ca, deferred)
|
||||||
|
|
||||||
|
## Remaining sops cleanup
|
||||||
|
|
||||||
|
The `sops-nix` flake input, `system/sops.nix`, `.sops.yaml`, and `secrets/` directory are
|
||||||
|
still present because `ca` still uses sops for its step-ca secrets (5 secrets in
|
||||||
|
`services/ca/default.nix`). The `services/authelia/` and `services/lldap/` modules also
|
||||||
|
reference sops but are only used by auth01 (decommissioned).
|
||||||
|
|
||||||
|
Once `ca` is migrated to OpenBao PKI (Phase 4c in host-migration-to-opentofu.md), remove:
|
||||||
|
- `sops-nix` input from `flake.nix`
|
||||||
|
- `sops-nix.nixosModules.sops` from all host module lists in `flake.nix`
|
||||||
|
- `inherit sops-nix` from all specialArgs in `flake.nix`
|
||||||
|
- `system/sops.nix` and its import in `system/default.nix`
|
||||||
|
- `.sops.yaml`
|
||||||
|
- `secrets/` directory
|
||||||
|
- All `sops.secrets.*` declarations in `services/ca/`, `services/authelia/`, `services/lldap/`
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Migrate all hosts from sops-nix secrets to OpenBao (vault) secrets management. Pilot with ha1, then roll out to remaining hosts in waves.
|
||||||
|
|
||||||
|
## Pre-requisites (completed)
|
||||||
|
|
||||||
|
1. Hardcoded root password hash in `system/root-user.nix` (removes sops dependency for all hosts)
|
||||||
|
2. Added `extractKey` option to `system/vault-secrets.nix` (extracts single key as file)
|
||||||
|
|
||||||
|
## Deployment Order
|
||||||
|
|
||||||
|
### Pilot: ha1
|
||||||
|
- Terraform: shared/backup/password secret, ha1 AppRole policy
|
||||||
|
- Provision AppRole credentials via `playbooks/provision-approle.yml`
|
||||||
|
- NixOS: vault.enable + backup-helper vault secret
|
||||||
|
|
||||||
|
### Wave 1: nats1, jelly01, pgdb1
|
||||||
|
- No service secrets (only root password, already handled)
|
||||||
|
- Just need AppRole policies + credential provisioning
|
||||||
|
|
||||||
|
### Wave 2: monitoring01
|
||||||
|
- 3 secrets: backup password, nats nkey, pve-exporter config
|
||||||
|
- Updates: alerttonotify.nix, pve.nix, configuration.nix
|
||||||
|
|
||||||
|
### Wave 3: ns1, then ns2 (critical - deploy ns1 first, verify, then ns2)
|
||||||
|
- DNS zone transfer key (shared/dns/xfer-key)
|
||||||
|
|
||||||
|
### Wave 4: http-proxy
|
||||||
|
- WireGuard private key
|
||||||
|
|
||||||
|
### Wave 5: nix-cache01
|
||||||
|
- Cache signing key + Gitea Actions token
|
||||||
|
|
||||||
|
### Wave 6: ca (DEFERRED - waiting for PKI migration)
|
||||||
|
|
||||||
|
### Skipped: auth01 (decommissioned)
|
||||||
|
|
||||||
|
## Terraform variables needed
|
||||||
|
|
||||||
|
User must extract from sops and add to `terraform/vault/terraform.tfvars`:
|
||||||
|
|
||||||
|
| Variable | Source |
|
||||||
|
|----------|--------|
|
||||||
|
| `backup_helper_secret` | `sops -d secrets/secrets.yaml` |
|
||||||
|
| `ns_xfer_key` | `sops -d secrets/secrets.yaml` |
|
||||||
|
| `nats_nkey` | `sops -d secrets/secrets.yaml` |
|
||||||
|
| `pve_exporter_config` | `sops -d secrets/monitoring01/pve-exporter.yaml` |
|
||||||
|
| `wireguard_private_key` | `sops -d secrets/http-proxy/wireguard.yaml` |
|
||||||
|
| `cache_signing_key` | `sops -d secrets/nix-cache01/cache-secret` |
|
||||||
|
| `actions_token_1` | `sops -d secrets/nix-cache01/actions_token_1` |
|
||||||
|
|
||||||
|
## Provisioning AppRole credentials
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export BAO_ADDR='https://vault01.home.2rjus.net:8200'
|
||||||
|
export BAO_TOKEN='<root-token>'
|
||||||
|
nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=<host>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verification (per host)
|
||||||
|
|
||||||
|
1. `systemctl status vault-secret-*` - all secret fetch services succeeded
|
||||||
|
2. Check secret files exist at expected paths with correct permissions
|
||||||
|
3. Verify dependent services are running
|
||||||
|
4. Check `/var/lib/vault/cache/` is populated (fallback ready)
|
||||||
|
5. Reboot host to verify boot-time secret fetching works
|
||||||
109
docs/plans/completed/zigbee-sensor-battery-monitoring.md
Normal file
109
docs/plans/completed/zigbee-sensor-battery-monitoring.md
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# Zigbee Sensor Battery Monitoring
|
||||||
|
|
||||||
|
**Status:** Completed
|
||||||
|
**Branch:** `zigbee-battery-fix`
|
||||||
|
**Commit:** `c515a6b home-assistant: fix zigbee sensor battery reporting`
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
Three Aqara Zigbee temperature sensors report `battery: 0` in their MQTT payload, making the `hass_sensor_battery_percent` Prometheus metric useless for battery monitoring on these devices.
|
||||||
|
|
||||||
|
Affected sensors:
|
||||||
|
- **Temp Living Room** (`0x54ef441000a54d3c`) — WSDCGQ12LM
|
||||||
|
- **Temp Office** (`0x54ef441000a547bd`) — WSDCGQ12LM
|
||||||
|
- **temp_server** (`0x54ef441000a564b6`) — WSDCGQ12LM
|
||||||
|
|
||||||
|
The **Temp Bedroom** sensor (`0x00124b0025495463`) is a SONOFF SNZB-02 and reports battery correctly.
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
- All three sensors are actively reporting temperature, humidity, and pressure data — they are not dead.
|
||||||
|
- The Zigbee2MQTT payload includes a `voltage` field (e.g., `2707` = 2.707V), which indicates a healthy battery level (roughly 60-70% for a CR2032 coin cell, per the voltage-to-percentage mapping implemented below).
|
||||||
|
- CR2032 voltage reference: ~3.0V fresh, ~2.7V mid-life, ~2.1V dead.
|
||||||
|
- The `voltage` field is not exposed as a Prometheus metric — it exists only in the MQTT payload.
|
||||||
|
- This is a known firmware quirk with some Aqara WSDCGQ12LM sensors that always report 0% battery.
|
||||||
|
|
||||||
|
## Device Inventory
|
||||||
|
|
||||||
|
Full list of Zigbee devices on ha1 (12 total):
|
||||||
|
|
||||||
|
| Device | IEEE Address | Model | Type |
|
||||||
|
|--------|-------------|-------|------|
|
||||||
|
| temp_server | 0x54ef441000a564b6 | WSDCGQ12LM | Temperature sensor (battery fix applied) |
|
||||||
|
| (Temp Living Room) | 0x54ef441000a54d3c | WSDCGQ12LM | Temperature sensor (battery fix applied) |
|
||||||
|
| (Temp Office) | 0x54ef441000a547bd | WSDCGQ12LM | Temperature sensor (battery fix applied) |
|
||||||
|
| (Temp Bedroom) | 0x00124b0025495463 | SNZB-02 | Temperature sensor (battery works) |
|
||||||
|
| (Water leak) | 0x54ef4410009ac117 | SJCGQ12LM | Water leak sensor |
|
||||||
|
| btn_livingroom | 0x54ef441000a1f907 | WXKG13LM | Wireless mini switch |
|
||||||
|
| btn_bedroom | 0x54ef441000a1ee71 | WXKG13LM | Wireless mini switch |
|
||||||
|
| (Hue bulb) | 0x001788010dc35d06 | 9290024688 | Hue E27 1100lm (Router) |
|
||||||
|
| (Hue bulb) | 0x001788010dc5f003 | 9290024688 | Hue E27 1100lm (Router) |
|
||||||
|
| (Hue ceiling) | 0x001788010e371aa4 | 915005997301 | Hue Infuse medium (Router) |
|
||||||
|
| (Hue ceiling) | 0x001788010d253b99 | 915005997301 | Hue Infuse medium (Router) |
|
||||||
|
| (Hue wall) | 0x001788010d1b599a | 929003052901 | Hue Sana wall light (Router, transition=5) |
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Solution 1: Calculate battery from voltage in Zigbee2MQTT (Implemented)
|
||||||
|
|
||||||
|
Override the Home Assistant battery entity's `value_template` in Zigbee2MQTT device configuration to calculate battery percentage from voltage.
|
||||||
|
|
||||||
|
**Formula:** `(voltage - 2100) / 9` (maps 2100-3000mV to 0-100%)
|
||||||
|
|
||||||
|
**Changes in `services/home-assistant/default.nix`:**
|
||||||
|
- Device configuration moved from external `devices.yaml` to inline NixOS config
|
||||||
|
- Three affected sensors have `homeassistant.sensor_battery.value_template` override
|
||||||
|
- All 12 devices now declaratively managed
|
||||||
|
|
||||||
|
**Expected battery values based on current voltages:**
|
||||||
|
| Sensor | Voltage | Expected Battery |
|
||||||
|
|--------|---------|------------------|
|
||||||
|
| Temp Living Room | 2710 mV | ~68% |
|
||||||
|
| Temp Office | 2658 mV | ~62% |
|
||||||
|
| temp_server | 2765 mV | ~74% |
|
||||||
|
|
||||||
|
### Solution 2: Alert on sensor staleness (Implemented)
|
||||||
|
|
||||||
|
Added Prometheus alert `zigbee_sensor_stale` in `services/monitoring/rules.yml` that fires when a Zigbee temperature sensor hasn't updated in over 1 hour. This provides defense-in-depth for detecting dead sensors regardless of battery reporting accuracy.
|
||||||
|
|
||||||
|
**Alert details:**
|
||||||
|
- Expression: `(time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 3600`
|
||||||
|
- Severity: warning
|
||||||
|
- For: 5m
|
||||||
|
|
||||||
|
## Pre-Deployment Verification
|
||||||
|
|
||||||
|
### Backup Verification
|
||||||
|
|
||||||
|
Before deployment, verified ha1 backup configuration and ran manual backup:
|
||||||
|
|
||||||
|
**Backup paths:**
|
||||||
|
- `/var/lib/hass` ✓
|
||||||
|
- `/var/lib/zigbee2mqtt` ✓
|
||||||
|
- `/var/lib/mosquitto` ✓
|
||||||
|
|
||||||
|
**Manual backup (2026-02-05 22:45:23):**
|
||||||
|
- Snapshot ID: `59704dfa`
|
||||||
|
- Files: 77 total (0 new, 13 changed, 64 unmodified)
|
||||||
|
- Data: 62.635 MiB processed, 6.928 MiB stored (compressed)
|
||||||
|
|
||||||
|
### Other directories reviewed
|
||||||
|
|
||||||
|
- `/var/lib/vault` — Contains AppRole credentials; not backed up (can be re-provisioned via Ansible)
|
||||||
|
- `/var/lib/sops-nix` — Legacy; ha1 uses Vault now
|
||||||
|
|
||||||
|
## Post-Deployment Steps
|
||||||
|
|
||||||
|
After deploying to ha1:
|
||||||
|
|
||||||
|
1. Restart zigbee2mqtt service (automatic on NixOS rebuild)
|
||||||
|
2. In Home Assistant, the battery entities may need to be re-discovered:
|
||||||
|
- Go to Settings → Devices & Services → MQTT
|
||||||
|
- The new `value_template` should take effect after entity re-discovery
|
||||||
|
- If not, try disabling and re-enabling the battery entities
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Device configuration is now declarative in NixOS. Future device additions via Zigbee2MQTT frontend will need to be added to the NixOS config to persist.
|
||||||
|
- The `devices.yaml` file on ha1 will be overwritten on service start but can be removed after confirming the new config works.
|
||||||
|
- The NixOS zigbee2mqtt module defaults to `devices = "devices.yaml"` but our explicit inline config overrides this.
|
||||||
179
docs/plans/homelab-exporter.md
Normal file
179
docs/plans/homelab-exporter.md
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
# Homelab Infrastructure Exporter
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Build a Prometheus exporter for metrics specific to our homelab infrastructure. Unlike the generic nixos-exporter, this covers services and patterns unique to our environment.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### Existing Exporters
|
||||||
|
- **node-exporter** (all hosts): System metrics
|
||||||
|
- **systemd-exporter** (all hosts): Service restart counts, IP accounting
|
||||||
|
- **labmon** (monitoring01): TLS certificate monitoring, step-ca health
|
||||||
|
- **Service-specific**: unbound, postgres, nats, jellyfin, home-assistant, caddy, step-ca
|
||||||
|
|
||||||
|
### Gaps
|
||||||
|
- No visibility into Vault/OpenBao lease expiry
|
||||||
|
- No ACME certificate expiry from internal CA
|
||||||
|
- No Proxmox guest agent metrics from inside VMs
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
### Vault/OpenBao Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_vault_token_expiry_seconds` | Seconds until AppRole token expires | Token metadata or lease file |
|
||||||
|
| `homelab_vault_token_renewable` | 1 if token is renewable | Token metadata |
|
||||||
|
|
||||||
|
Labels: `role` (AppRole name)
|
||||||
|
|
||||||
|
### ACME Certificate Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_acme_cert_expiry_seconds` | Seconds until certificate expires | Parse cert from `/var/lib/acme/*/cert.pem` |
|
||||||
|
| `homelab_acme_cert_not_after` | Unix timestamp of cert expiry | Certificate NotAfter field |
|
||||||
|
|
||||||
|
Labels: `domain`, `issuer`
|
||||||
|
|
||||||
|
Note: labmon already monitors external TLS endpoints. This covers local ACME-managed certs.
|
||||||
|
|
||||||
|
### Proxmox Guest Metrics (future)
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_proxmox_guest_info` | Info gauge with VM ID, name | QEMU guest agent |
|
||||||
|
| `homelab_proxmox_guest_agent_running` | 1 if guest agent is responsive | Agent ping |
|
||||||
|
|
||||||
|
### DNS Zone Metrics (future)
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_dns_zone_serial` | Current zone serial number | DNS AXFR or zone file |
|
||||||
|
|
||||||
|
Labels: `zone`
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
Single binary with collectors enabled via config. Runs on hosts that need specific collectors.
|
||||||
|
|
||||||
|
```
|
||||||
|
homelab-exporter
|
||||||
|
├── main.go
|
||||||
|
├── collector/
|
||||||
|
│ ├── vault.go # Vault/OpenBao token metrics
|
||||||
|
│ ├── acme.go # ACME certificate metrics
|
||||||
|
│ └── proxmox.go # Proxmox guest agent (future)
|
||||||
|
└── config/
|
||||||
|
└── config.go
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
listen_addr: ":9970"
|
||||||
|
collectors:
|
||||||
|
vault:
|
||||||
|
enabled: true
|
||||||
|
token_path: "/var/lib/vault/token"
|
||||||
|
acme:
|
||||||
|
enabled: true
|
||||||
|
cert_dirs:
|
||||||
|
- "/var/lib/acme"
|
||||||
|
proxmox:
|
||||||
|
enabled: false
|
||||||
|
```
|
||||||
|
|
||||||
|
## NixOS Module
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.homelab-exporter = {
|
||||||
|
enable = true;
|
||||||
|
port = 9970;
|
||||||
|
collectors = {
|
||||||
|
vault = {
|
||||||
|
enable = true;
|
||||||
|
tokenPath = "/var/lib/vault/token";
|
||||||
|
};
|
||||||
|
acme = {
|
||||||
|
enable = true;
|
||||||
|
certDirs = [ "/var/lib/acme" ];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Auto-register scrape target
|
||||||
|
homelab.monitoring.scrapeTargets = [{
|
||||||
|
job_name = "homelab-exporter";
|
||||||
|
port = 9970;
|
||||||
|
}];
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration
|
||||||
|
|
||||||
|
### Deployment
|
||||||
|
|
||||||
|
Deploy on hosts that have relevant data:
|
||||||
|
- **All hosts with ACME certs**: acme collector
|
||||||
|
- **All hosts with Vault**: vault collector
|
||||||
|
- **Proxmox VMs**: proxmox collector (when implemented)
|
||||||
|
|
||||||
|
### Relationship with nixos-exporter
|
||||||
|
|
||||||
|
These are complementary:
|
||||||
|
- **nixos-exporter** (port 9971): Generic NixOS metrics, deploy everywhere
|
||||||
|
- **homelab-exporter** (port 9970): Infrastructure-specific, deploy selectively
|
||||||
|
|
||||||
|
Both can run on the same host if needed.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Language
|
||||||
|
|
||||||
|
Go - consistent with labmon and nixos-exporter.
|
||||||
|
|
||||||
|
### Phase 1: Core + ACME
|
||||||
|
1. Create git repository (git.t-juice.club/torjus/homelab-exporter)
|
||||||
|
2. Implement ACME certificate collector
|
||||||
|
3. HTTP server with `/metrics`
|
||||||
|
4. NixOS module
|
||||||
|
|
||||||
|
### Phase 2: Vault Collector
|
||||||
|
1. Implement token expiry detection
|
||||||
|
2. Handle missing/expired tokens gracefully
|
||||||
|
|
||||||
|
### Phase 3: Dashboard
|
||||||
|
1. Create Grafana dashboard for infrastructure health
|
||||||
|
2. Add to existing monitoring service module
|
||||||
|
|
||||||
|
## Alert Examples
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- alert: VaultTokenExpiringSoon
|
||||||
|
expr: homelab_vault_token_expiry_seconds < 3600
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Vault token on {{ $labels.instance }} expires in < 1 hour"
|
||||||
|
|
||||||
|
- alert: ACMECertExpiringSoon
|
||||||
|
expr: homelab_acme_cert_expiry_seconds < 7 * 24 * 3600
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "ACME cert {{ $labels.domain }} on {{ $labels.instance }} expires in < 7 days"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] How to read Vault token expiry without re-authenticating?
|
||||||
|
- [ ] Should ACME collector also check key/cert match?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Port 9970 (labmon uses 9969, nixos-exporter will use 9971)
|
||||||
|
- Keep infrastructure-specific logic here, generic NixOS stuff in nixos-exporter
|
||||||
|
- Consider merging Proxmox metrics with pve-exporter if overlap is significant
|
||||||
216
docs/plans/host-migration-to-opentofu.md
Normal file
216
docs/plans/host-migration-to-opentofu.md
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
# Host Migration to OpenTofu
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Migrate all existing hosts (provisioned manually before the OpenTofu pipeline) into the new
|
||||||
|
OpenTofu-managed provisioning workflow. Hosts are categorized by their state requirements:
|
||||||
|
stateless hosts are simply recreated, stateful hosts require backup and restore, and some
|
||||||
|
hosts are decommissioned or deferred.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
Hosts already managed by OpenTofu: `vault01`, `testvm01`, `testvm02`, `testvm03`, `ns2`, `ns1`
|
||||||
|
|
||||||
|
Hosts to migrate:
|
||||||
|
|
||||||
|
| Host | Category | Notes |
|
||||||
|
|------|----------|-------|
|
||||||
|
| ~~ns1~~ | ~~Stateless~~ | ✓ Complete |
|
||||||
|
| nix-cache01 | Stateless | Binary cache, recreate |
|
||||||
|
| http-proxy | Stateless | Reverse proxy, recreate |
|
||||||
|
| nats1 | Stateless | Messaging, recreate |
|
||||||
|
| ha1 | Stateful | Home Assistant + Zigbee2MQTT + Mosquitto |
|
||||||
|
| monitoring01 | Stateful | Prometheus, Grafana, Loki |
|
||||||
|
| jelly01 | Stateful | Jellyfin metadata, watch history, config |
|
||||||
|
| pgdb1 | Decommission | Only used by Open WebUI on gunter, migrating to local postgres |
|
||||||
|
| ~~jump~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
|
| ~~auth01~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
|
| ~~ca~~ | ~~Deferred~~ | ✓ Complete |
|
||||||
|
|
||||||
|
## Phase 1: Backup Preparation
|
||||||
|
|
||||||
|
Before migrating any stateful host, ensure restic backups are in place and verified.
|
||||||
|
|
||||||
|
### 1a. Expand monitoring01 Grafana Backup
|
||||||
|
|
||||||
|
The existing backup only covers `/var/lib/grafana/plugins` and a sqlite dump of `grafana.db`.
|
||||||
|
Expand to back up all of `/var/lib/grafana/` to capture config directory and any other state.
|
||||||
|
|
||||||
|
### 1b. Add Jellyfin Backup to jelly01
|
||||||
|
|
||||||
|
No backup currently exists. Add a restic backup job for `/var/lib/jellyfin/` which contains:
|
||||||
|
- `config/` — server settings, library configuration
|
||||||
|
- `data/` — user watch history, playback state, library metadata
|
||||||
|
|
||||||
|
Media files are on the NAS (`nas.home.2rjus.net:/mnt/hdd-pool/media`) and do not need backup.
|
||||||
|
The cache directory (`/var/cache/jellyfin/`) does not need backup — it regenerates.
|
||||||
|
|
||||||
|
### 1c. Verify Existing ha1 Backup
|
||||||
|
|
||||||
|
ha1 already backs up `/var/lib/hass`, `/var/lib/zigbee2mqtt`, `/var/lib/mosquitto`. Verify
|
||||||
|
these backups are current and restorable before proceeding with migration.
|
||||||
|
|
||||||
|
### 1d. Verify All Backups
|
||||||
|
|
||||||
|
After adding/expanding backup jobs:
|
||||||
|
1. Trigger a manual backup run on each host
|
||||||
|
2. Verify backup integrity with `restic check`
|
||||||
|
3. Test a restore to a temporary location to confirm data is recoverable
|
||||||
|
|
||||||
|
## Phase 2: Stateless Host Migration
|
||||||
|
|
||||||
|
These hosts have no meaningful state and can be recreated fresh. For each host:
|
||||||
|
|
||||||
|
1. Add the host definition to `terraform/vms.tf` (using `create-host` or manually)
|
||||||
|
2. Commit and push to master
|
||||||
|
3. Run `tofu apply` to provision the new VM
|
||||||
|
4. Wait for bootstrap to complete (VM pulls config from master and reboots)
|
||||||
|
5. Verify the host is functional
|
||||||
|
6. Decommission the old VM in Proxmox
|
||||||
|
|
||||||
|
### Migration Order
|
||||||
|
|
||||||
|
Migrate stateless hosts in an order that minimizes disruption:
|
||||||
|
|
||||||
|
1. **nix-cache01** — low risk, no downstream dependencies during migration
|
||||||
|
2. **nats1** — low risk, verify no persistent JetStream streams first
|
||||||
|
3. **http-proxy** — brief disruption to proxied services, migrate during low-traffic window
|
||||||
|
4. ~~**ns1** — ns2 already migrated, verify AXFR works after ns1 migration~~ ✓ Complete
|
||||||
|
|
||||||
|
~~For ns1/ns2: migrate ns2 first (secondary), verify AXFR works, then migrate ns1.~~ Both ns1
|
||||||
|
and ns2 migration complete. Zone transfer (AXFR) verified working between ns1 (primary) and
|
||||||
|
ns2 (secondary).
|
||||||
|
|
||||||
|
## Phase 3: Stateful Host Migration
|
||||||
|
|
||||||
|
For each stateful host, the procedure is:
|
||||||
|
|
||||||
|
1. Trigger a final restic backup
|
||||||
|
2. Stop services on the old host (to prevent state drift during migration)
|
||||||
|
3. Provision the new VM via `tofu apply`
|
||||||
|
4. Wait for bootstrap to complete
|
||||||
|
5. Stop the relevant services on the new host
|
||||||
|
6. Restore data from restic backup
|
||||||
|
7. Start services and verify functionality
|
||||||
|
8. Decommission the old VM
|
||||||
|
|
||||||
|
### 3a. monitoring01
|
||||||
|
|
||||||
|
1. Run final Grafana backup
|
||||||
|
2. Provision new monitoring01 via OpenTofu
|
||||||
|
3. After bootstrap, restore `/var/lib/grafana/` from restic
|
||||||
|
4. Restart Grafana, verify dashboards and datasources are intact
|
||||||
|
5. Prometheus and Loki start fresh with empty data (acceptable)
|
||||||
|
6. Verify all scrape targets are being collected
|
||||||
|
7. Decommission old VM
|
||||||
|
|
||||||
|
### 3b. jelly01
|
||||||
|
|
||||||
|
1. Run final Jellyfin backup
|
||||||
|
2. Provision new jelly01 via OpenTofu
|
||||||
|
3. After bootstrap, restore `/var/lib/jellyfin/` from restic
|
||||||
|
4. Verify NFS mount to NAS is working
|
||||||
|
5. Start Jellyfin, verify watch history and library metadata are present
|
||||||
|
6. Decommission old VM
|
||||||
|
|
||||||
|
### 3c. ha1
|
||||||
|
|
||||||
|
1. Verify latest restic backup is current
|
||||||
|
2. Stop Home Assistant, Zigbee2MQTT, and Mosquitto on old host
|
||||||
|
3. Provision new ha1 via OpenTofu
|
||||||
|
4. After bootstrap, restore `/var/lib/hass`, `/var/lib/zigbee2mqtt`, `/var/lib/mosquitto`
|
||||||
|
5. Start services, verify Home Assistant is functional
|
||||||
|
6. Verify Zigbee devices are still paired and communicating
|
||||||
|
7. Decommission old VM
|
||||||
|
|
||||||
|
**Note:** ha1 currently has 2 GB RAM, which is consistently tight. Average memory usage has
|
||||||
|
climbed from ~57% (30-day avg) to ~70% currently, with a 30-day low of only 187 MB free.
|
||||||
|
Consider increasing to 4 GB when reprovisioning to allow headroom for additional integrations.
|
||||||
|
|
||||||
|
**Note:** ha1 is the highest-risk migration due to Zigbee device pairings. The Zigbee
|
||||||
|
coordinator state in `/var/lib/zigbee2mqtt` should preserve pairings, but verify on a
|
||||||
|
non-critical time window.
|
||||||
|
|
||||||
|
**USB Passthrough:** The ha1 VM has a USB device passed through from the Proxmox hypervisor
|
||||||
|
(the Zigbee coordinator). The new VM must be configured with the same USB passthrough in
|
||||||
|
OpenTofu/Proxmox. Verify the USB device ID on the hypervisor and add the appropriate
|
||||||
|
`usb` block to the VM definition in `terraform/vms.tf`. The USB device must be passed
|
||||||
|
through before starting Zigbee2MQTT on the new host.
|
||||||
|
|
||||||
|
## Phase 4: Decommission Hosts
|
||||||
|
|
||||||
|
### jump ✓ COMPLETE
|
||||||
|
|
||||||
|
~~1. Verify nothing depends on the jump host (no SSH proxy configs pointing to it, etc.)~~
|
||||||
|
~~2. Remove host configuration from `hosts/jump/`~~
|
||||||
|
~~3. Remove from `flake.nix`~~
|
||||||
|
~~4. Remove any secrets in `secrets/jump/`~~
|
||||||
|
~~5. Remove from `.sops.yaml`~~
|
||||||
|
~~6. Destroy the VM in Proxmox~~
|
||||||
|
~~7. Commit cleanup~~
|
||||||
|
|
||||||
|
Host was already removed from flake.nix and VM destroyed. Configuration cleaned up in ba9f47f.
|
||||||
|
|
||||||
|
### auth01 ✓ COMPLETE
|
||||||
|
|
||||||
|
~~1. Remove host configuration from `hosts/auth01/`~~
|
||||||
|
~~2. Remove from `flake.nix`~~
|
||||||
|
~~3. Remove any secrets in `secrets/auth01/`~~
|
||||||
|
~~4. Remove from `.sops.yaml`~~
|
||||||
|
~~5. Remove `services/authelia/` and `services/lldap/` (only used by auth01)~~
|
||||||
|
~~6. Destroy the VM in Proxmox~~
|
||||||
|
~~7. Commit cleanup~~
|
||||||
|
|
||||||
|
Host configuration, services, and VM already removed.
|
||||||
|
|
||||||
|
### pgdb1 (in progress)
|
||||||
|
|
||||||
|
Only consumer was Open WebUI on gunter, which has been migrated to use local PostgreSQL.
|
||||||
|
|
||||||
|
1. ~~Verify Open WebUI on gunter is using local PostgreSQL (not pgdb1)~~ ✓
|
||||||
|
2. ~~Remove host configuration from `hosts/pgdb1/`~~ ✓
|
||||||
|
3. ~~Remove `services/postgres/` (only used by pgdb1)~~ ✓
|
||||||
|
4. ~~Remove from `flake.nix`~~ ✓
|
||||||
|
5. ~~Remove Vault AppRole from `terraform/vault/approle.tf`~~ ✓
|
||||||
|
6. Destroy the VM in Proxmox
|
||||||
|
7. ~~Commit cleanup~~ ✓
|
||||||
|
|
||||||
|
See `docs/plans/pgdb1-decommission.md` for detailed plan.
|
||||||
|
|
||||||
|
## Phase 5: Decommission ca Host ✓ COMPLETE
|
||||||
|
|
||||||
|
~~Deferred until Phase 4c (PKI migration to OpenBao) is complete. Once all hosts use the
|
||||||
|
OpenBao ACME endpoint for certificates, the step-ca host can be decommissioned following
|
||||||
|
the same cleanup steps as the jump host.~~
|
||||||
|
|
||||||
|
PKI migration to OpenBao complete. Host configuration, `services/ca/`, and VM removed.
|
||||||
|
|
||||||
|
## Phase 6: Remove sops-nix ✓ COMPLETE
|
||||||
|
|
||||||
|
~~Once `ca` is decommissioned (Phase 5), `sops-nix` is no longer used by any host. Remove
|
||||||
|
all remnants:~~
|
||||||
|
~~- `sops-nix` input from `flake.nix` and `flake.lock`~~
|
||||||
|
~~- `sops-nix.nixosModules.sops` from all host module lists in `flake.nix`~~
|
||||||
|
~~- `inherit sops-nix` from all specialArgs in `flake.nix`~~
|
||||||
|
~~- `system/sops.nix` and its import in `system/default.nix`~~
|
||||||
|
~~- `.sops.yaml`~~
|
||||||
|
~~- `secrets/` directory~~
|
||||||
|
~~- All `sops.secrets.*` declarations in `services/ca/`, `services/authelia/`, `services/lldap/`~~
|
||||||
|
~~- Template scripts that generate age keys for sops (`hosts/template/scripts.nix`,
|
||||||
|
`hosts/template2/scripts.nix`)~~
|
||||||
|
|
||||||
|
All sops-nix remnants removed. See `docs/plans/completed/sops-to-openbao-migration.md` for context.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Each host migration should be done individually, not in bulk, to limit blast radius
|
||||||
|
- Keep the old VM running until the new one is verified — do not destroy prematurely
|
||||||
|
- The old VMs use IPs that the new VMs need, so the old VM must be shut down before
|
||||||
|
the new one is provisioned (or use a temporary IP and swap after verification)
|
||||||
|
- Stateful migrations should be done during low-usage windows
|
||||||
|
- All decommissioned hosts (jump, auth01, ca) have now been removed
|
||||||
|
- Since many hosts are being recreated, this is a good opportunity to establish consistent
|
||||||
|
hostname naming conventions before provisioning the new VMs. Current naming is inconsistent
|
||||||
|
(e.g. `ns1` vs `nix-cache01`, `ha1` vs `auth01`, `pgdb1` vs `http-proxy`). Decide on a
|
||||||
|
convention before starting migrations — e.g. whether to always use numeric suffixes, a
|
||||||
|
consistent format like `service-NN`, role-based vs function-based names, etc.
|
||||||
122
docs/plans/long-term-metrics-storage.md
Normal file
122
docs/plans/long-term-metrics-storage.md
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
# Long-Term Metrics Storage Options
|
||||||
|
|
||||||
|
## Problem Statement
|
||||||
|
|
||||||
|
Current Prometheus configuration retains metrics for 30 days (`retentionTime = "30d"`). Extending retention further raises disk usage concerns on the homelab hypervisor with limited local storage.
|
||||||
|
|
||||||
|
Prometheus does not support downsampling - it stores all data at full resolution until the retention period expires, then deletes it entirely.
|
||||||
|
|
||||||
|
## Current Configuration
|
||||||
|
|
||||||
|
Location: `services/monitoring/prometheus.nix`
|
||||||
|
|
||||||
|
- **Retention**: 30 days
|
||||||
|
- **Scrape interval**: 15s
|
||||||
|
- **Features**: Alertmanager, Pushgateway, auto-generated scrape configs from flake hosts
|
||||||
|
- **Storage**: Local disk on monitoring01
|
||||||
|
|
||||||
|
## Options Evaluated
|
||||||
|
|
||||||
|
### Option 1: VictoriaMetrics
|
||||||
|
|
||||||
|
VictoriaMetrics is a Prometheus-compatible TSDB with significantly better compression (5-10x smaller storage footprint).
|
||||||
|
|
||||||
|
**NixOS Options Available:**
|
||||||
|
- `services.victoriametrics.enable`
|
||||||
|
- `services.victoriametrics.prometheusConfig` - accepts Prometheus scrape config format
|
||||||
|
- `services.victoriametrics.retentionPeriod` - e.g., "6m" for 6 months
|
||||||
|
- `services.vmagent` - dedicated scraping agent
|
||||||
|
- `services.vmalert` - alerting rules evaluation
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Simple migration - single service replacement
|
||||||
|
- Same PromQL query language - Grafana dashboards work unchanged
|
||||||
|
- Same scrape config format - existing auto-generated configs work as-is
|
||||||
|
- 5-10x better compression means 30 days of Prometheus data could become 180+ days
|
||||||
|
- Lightweight, single binary
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- No automatic downsampling (relies on compression alone)
|
||||||
|
- Alerting requires switching to vmalert instead of Prometheus alertmanager integration
|
||||||
|
- Would need to migrate existing data or start fresh
|
||||||
|
|
||||||
|
**Migration Steps:**
|
||||||
|
1. Replace `services.prometheus` with `services.victoriametrics`
|
||||||
|
2. Move scrape configs to `prometheusConfig`
|
||||||
|
3. Set up `services.vmalert` for alerting rules
|
||||||
|
4. Update Grafana datasource to VictoriaMetrics port (8428)
|
||||||
|
5. Keep Alertmanager for notification routing
|
||||||
|
|
||||||
|
### Option 2: Thanos
|
||||||
|
|
||||||
|
Thanos extends Prometheus with long-term storage and automatic downsampling by uploading data to object storage.
|
||||||
|
|
||||||
|
**NixOS Options Available:**
|
||||||
|
- `services.thanos.sidecar` - uploads Prometheus blocks to object storage
|
||||||
|
- `services.thanos.compact` - compacts and downsamples data
|
||||||
|
- `services.thanos.query` - unified query gateway
|
||||||
|
- `services.thanos.query-frontend` - query caching and parallelization
|
||||||
|
- `services.thanos.downsample` - dedicated downsampling service
|
||||||
|
|
||||||
|
**Downsampling Behavior:**
|
||||||
|
- Raw resolution kept for configurable period (default: indefinite)
|
||||||
|
- 5-minute resolution created after 40 hours
|
||||||
|
- 1-hour resolution created after 10 days
|
||||||
|
|
||||||
|
**Retention Configuration (in compactor):**
|
||||||
|
```nix
|
||||||
|
services.thanos.compact = {
|
||||||
|
retention.resolution-raw = "30d"; # Keep raw for 30 days
|
||||||
|
retention.resolution-5m = "180d"; # Keep 5m samples for 6 months
|
||||||
|
retention.resolution-1h = "2y"; # Keep 1h samples for 2 years
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- True downsampling - older data uses progressively less storage
|
||||||
|
- Keep metrics for years with minimal storage impact
|
||||||
|
- Prometheus continues running unchanged
|
||||||
|
- Existing Alertmanager integration preserved
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Requires object storage (MinIO, S3, or local filesystem)
|
||||||
|
- Multiple services to manage (sidecar, compactor, query)
|
||||||
|
- More complex architecture
|
||||||
|
- Additional infrastructure (MinIO) may be needed
|
||||||
|
|
||||||
|
**Required Components:**
|
||||||
|
1. Thanos Sidecar (runs alongside Prometheus)
|
||||||
|
2. Object storage (MinIO or local filesystem)
|
||||||
|
3. Thanos Compactor (handles downsampling)
|
||||||
|
4. Thanos Query (provides unified query endpoint)
|
||||||
|
|
||||||
|
**Migration Steps:**
|
||||||
|
1. Deploy object storage (MinIO or configure filesystem backend)
|
||||||
|
2. Add Thanos sidecar pointing to Prometheus data directory
|
||||||
|
3. Add Thanos compactor with retention policies
|
||||||
|
4. Add Thanos query gateway
|
||||||
|
5. Update Grafana datasource to Thanos Query port (10902)
|
||||||
|
|
||||||
|
## Comparison
|
||||||
|
|
||||||
|
| Aspect | VictoriaMetrics | Thanos |
|
||||||
|
|--------|-----------------|--------|
|
||||||
|
| Complexity | Low (1 service) | Higher (3-4 services) |
|
||||||
|
| Downsampling | No | Yes (automatic) |
|
||||||
|
| Storage savings | 5-10x compression | Compression + downsampling |
|
||||||
|
| Object storage required | No | Yes |
|
||||||
|
| Migration effort | Minimal | Moderate |
|
||||||
|
| Grafana changes | Change port only | Change port only |
|
||||||
|
| Alerting changes | Need vmalert | Keep existing |
|
||||||
|
|
||||||
|
## Recommendation
|
||||||
|
|
||||||
|
**Start with VictoriaMetrics** for simplicity. The compression alone may provide 6+ months of retention in the same disk space currently used for 30 days.
|
||||||
|
|
||||||
|
If multi-year retention with true downsampling becomes necessary, Thanos can be evaluated later. However, it requires deploying object storage infrastructure (MinIO) which adds operational complexity.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- VictoriaMetrics docs: https://docs.victoriametrics.com/
|
||||||
|
- Thanos docs: https://thanos.io/tip/thanos/getting-started.md/
|
||||||
|
- NixOS options searched from nixpkgs revision e576e3c9 (NixOS 25.11)
|
||||||
116
docs/plans/memory-issues-follow-up.md
Normal file
116
docs/plans/memory-issues-follow-up.md
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# Memory Issues Follow-up
|
||||||
|
|
||||||
|
Tracking the zram change to verify it resolves OOM issues during nixos-upgrade on low-memory hosts.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
On 2026-02-08, ns2 (2GB RAM) experienced an OOM kill during nixos-upgrade. The Nix evaluation process consumed ~1.6GB before being killed by the kernel. ns1 (manually increased to 4GB) succeeded with the same upgrade.
|
||||||
|
|
||||||
|
Root cause: 2GB RAM is insufficient for Nix flake evaluation without swap.
|
||||||
|
|
||||||
|
## Fix Applied
|
||||||
|
|
||||||
|
**Commit:** `1674b6a` - system: enable zram swap for all hosts
|
||||||
|
|
||||||
|
**Merged:** 2026-02-08 ~12:15 UTC
|
||||||
|
|
||||||
|
**Change:** Added `zramSwap.enable = true` to `system/zram.nix`, providing ~2GB compressed swap on all hosts.
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
|
||||||
|
| Time (UTC) | Event |
|
||||||
|
|------------|-------|
|
||||||
|
| 05:00:46 | ns2 nixos-upgrade OOM killed |
|
||||||
|
| 05:01:47 | `nixos_upgrade_failed` alert fired |
|
||||||
|
| 12:15 | zram commit merged to master |
|
||||||
|
| 12:19 | ns2 rebooted with zram enabled |
|
||||||
|
| 12:20 | ns1 rebooted (memory reduced to 2GB via tofu) |
|
||||||
|
|
||||||
|
## Hosts Affected
|
||||||
|
|
||||||
|
All 2GB VMs that run nixos-upgrade:
|
||||||
|
- ns1, ns2 (DNS)
|
||||||
|
- vault01
|
||||||
|
- testvm01, testvm02, testvm03
|
||||||
|
- kanidm01
|
||||||
|
|
||||||
|
## Metrics to Monitor
|
||||||
|
|
||||||
|
Check these in Grafana or via PromQL to verify the fix:
|
||||||
|
|
||||||
|
### Swap availability (should be ~2GB after upgrade)
|
||||||
|
```promql
|
||||||
|
node_memory_SwapTotal_bytes / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Swap usage during upgrades
|
||||||
|
```promql
|
||||||
|
(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Zswap compressed bytes (active compression) — NOTE(review): the fix enables zram, not zswap; `node_memory_Zswap_bytes` may read 0 on zram-only hosts — verify which mechanism is actually reporting
|
||||||
|
```promql
|
||||||
|
node_memory_Zswap_bytes / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Upgrade failures (should be 0)
|
||||||
|
```promql
|
||||||
|
node_systemd_unit_state{name="nixos-upgrade.service", state="failed"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Memory available during upgrades
|
||||||
|
```promql
|
||||||
|
node_memory_MemAvailable_bytes / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verification Steps
|
||||||
|
|
||||||
|
After a few days (allow auto-upgrades to run on all hosts):
|
||||||
|
|
||||||
|
1. Check all hosts have swap enabled:
|
||||||
|
```promql
|
||||||
|
node_memory_SwapTotal_bytes > 0
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check for any upgrade failures since the fix:
|
||||||
|
```promql
|
||||||
|
count_over_time(ALERTS{alertname="nixos_upgrade_failed"}[7d])
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Review if any hosts used swap during upgrades (check historical graphs)
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
- No `nixos_upgrade_failed` alerts due to OOM after 2026-02-08
|
||||||
|
- All hosts show ~2GB swap available
|
||||||
|
- Upgrades complete successfully on 2GB VMs
|
||||||
|
|
||||||
|
## Fallback Options
|
||||||
|
|
||||||
|
If zram is insufficient:
|
||||||
|
|
||||||
|
1. **Increase VM memory** - Update `terraform/vms.tf` to 4GB for affected hosts
|
||||||
|
2. **Enable memory ballooning** - Configure VMs with dynamic memory allocation (see below)
|
||||||
|
3. **Use remote builds** - Configure `nix.buildMachines` to offload evaluation
|
||||||
|
4. **Reduce flake size** - Split configurations to reduce evaluation memory
|
||||||
|
|
||||||
|
### Memory Ballooning
|
||||||
|
|
||||||
|
Proxmox supports memory ballooning, which allows VMs to dynamically grow/shrink memory allocation based on demand. The balloon driver inside the guest communicates with the hypervisor to release or reclaim memory pages.
|
||||||
|
|
||||||
|
Configuration in `terraform/vms.tf`:
|
||||||
|
```hcl
|
||||||
|
memory = 4096 # maximum memory
|
||||||
|
balloon = 2048 # minimum memory (shrinks to this when idle)
|
||||||
|
```
|
||||||
|
|
||||||
|
Pros:
|
||||||
|
- VMs get memory on-demand without reboots
|
||||||
|
- Better host memory utilization
|
||||||
|
- Solves upgrade OOM without permanently allocating 4GB
|
||||||
|
|
||||||
|
Cons:
|
||||||
|
- Requires QEMU guest agent running in guest
|
||||||
|
- Guest can experience memory pressure if host is overcommitted
|
||||||
|
|
||||||
|
Ballooning and zram are complementary - ballooning provides headroom from the host, zram provides overflow within the guest.
|
||||||
219
docs/plans/monitoring-migration-victoriametrics.md
Normal file
219
docs/plans/monitoring-migration-victoriametrics.md
Normal file
@@ -0,0 +1,219 @@
|
|||||||
|
# Monitoring Stack Migration to VictoriaMetrics
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Migrate from Prometheus to VictoriaMetrics on a new host (monitoring02) to gain better compression
|
||||||
|
and longer retention. Run in parallel with monitoring01 until validated, then switch over using
|
||||||
|
a `monitoring` CNAME for seamless transition.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
**monitoring01** (10.69.13.13):
|
||||||
|
- 4 CPU cores, 4GB RAM, 33GB disk
|
||||||
|
- Prometheus with 30-day retention (15s scrape interval)
|
||||||
|
- Alertmanager (routes to alerttonotify webhook)
|
||||||
|
- Grafana (dashboards, datasources)
|
||||||
|
- Loki (log aggregation from all hosts via Promtail)
|
||||||
|
- Tempo (distributed tracing)
|
||||||
|
- Pyroscope (continuous profiling)
|
||||||
|
|
||||||
|
**Hardcoded References to monitoring01:**
|
||||||
|
- `system/monitoring/logs.nix` - Promtail sends logs to `http://monitoring01.home.2rjus.net:3100`
|
||||||
|
- `hosts/template2/bootstrap.nix` - Bootstrap logs to Loki (keep as-is until decommission)
|
||||||
|
- `services/http-proxy/proxy.nix` - Caddy proxies Prometheus, Alertmanager, Grafana, Pyroscope, Pushgateway
|
||||||
|
|
||||||
|
**Auto-generated:**
|
||||||
|
- Prometheus scrape targets (from `lib/monitoring.nix` + `homelab.monitoring.scrapeTargets`)
|
||||||
|
- Node-exporter targets (from all hosts with static IPs)
|
||||||
|
|
||||||
|
## Decision: VictoriaMetrics
|
||||||
|
|
||||||
|
Per `docs/plans/long-term-metrics-storage.md`, VictoriaMetrics is the recommended starting point:
|
||||||
|
- Single binary replacement for Prometheus
|
||||||
|
- 5-10x better compression (30 days could become 180+ days in same space)
|
||||||
|
- Same PromQL query language (Grafana dashboards work unchanged)
|
||||||
|
- Same scrape config format (existing auto-generated configs work)
|
||||||
|
|
||||||
|
If multi-year retention with downsampling becomes necessary later, Thanos can be evaluated.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐
|
||||||
|
│ monitoring02 │
|
||||||
|
│ VictoriaMetrics│
|
||||||
|
│ + Grafana │
|
||||||
|
monitoring │ + Loki │
|
||||||
|
CNAME ──────────│ + Tempo │
|
||||||
|
│ + Pyroscope │
|
||||||
|
│ + Alertmanager │
|
||||||
|
│ (vmalert) │
|
||||||
|
└─────────────────┘
|
||||||
|
▲
|
||||||
|
│ scrapes
|
||||||
|
┌───────────────┼───────────────┐
|
||||||
|
│ │ │
|
||||||
|
┌────┴────┐ ┌─────┴────┐ ┌─────┴────┐
|
||||||
|
│ ns1 │ │ ha1 │ │ ... │
|
||||||
|
│ :9100 │ │ :9100 │ │ :9100 │
|
||||||
|
└─────────┘ └──────────┘ └──────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
### Phase 1: Create monitoring02 Host
|
||||||
|
|
||||||
|
Use `create-host` script which handles flake.nix and terraform/vms.tf automatically.
|
||||||
|
|
||||||
|
1. **Run create-host**: `nix develop -c create-host monitoring02 10.69.13.24`
|
||||||
|
2. **Update VM resources** in `terraform/vms.tf`:
|
||||||
|
- 4 cores (same as monitoring01)
|
||||||
|
- 8GB RAM (double, for VictoriaMetrics headroom)
|
||||||
|
- 100GB disk (for 3+ months retention with compression)
|
||||||
|
3. **Update host configuration**: Import monitoring services
|
||||||
|
4. **Create Vault AppRole**: Add to `terraform/vault/approle.tf`
|
||||||
|
|
||||||
|
### Phase 2: Set Up VictoriaMetrics Stack
|
||||||
|
|
||||||
|
Create new service module at `services/monitoring/victoriametrics/` for testing alongside existing
|
||||||
|
Prometheus config. Once validated, this can replace the Prometheus module.
|
||||||
|
|
||||||
|
1. **VictoriaMetrics** (port 8428):
|
||||||
|
- `services.victoriametrics.enable = true`
|
||||||
|
- `services.victoriametrics.retentionPeriod = "3m"` (3 months, increase later based on disk usage)
|
||||||
|
- Migrate scrape configs via `prometheusConfig`
|
||||||
|
- Use native push support (replaces Pushgateway)
|
||||||
|
|
||||||
|
2. **vmalert** for alerting rules:
|
||||||
|
- `services.vmalert.enable = true`
|
||||||
|
- Point to VictoriaMetrics for metrics evaluation
|
||||||
|
- Keep rules in separate `rules.yml` file (same format as Prometheus)
|
||||||
|
- No receiver configured during parallel operation (prevents duplicate alerts)
|
||||||
|
|
||||||
|
3. **Alertmanager** (port 9093):
|
||||||
|
- Keep existing configuration (alerttonotify webhook routing)
|
||||||
|
- Only enable receiver after cutover from monitoring01
|
||||||
|
|
||||||
|
4. **Loki** (port 3100):
|
||||||
|
- Same configuration as current
|
||||||
|
|
||||||
|
5. **Grafana** (port 3000):
|
||||||
|
- Define dashboards declaratively via NixOS options (not imported from monitoring01)
|
||||||
|
- Reference existing dashboards on monitoring01 for content inspiration
|
||||||
|
- Configure VictoriaMetrics datasource (port 8428)
|
||||||
|
- Configure Loki datasource
|
||||||
|
|
||||||
|
6. **Tempo** (ports 3200, 3201):
|
||||||
|
- Same configuration
|
||||||
|
|
||||||
|
7. **Pyroscope** (port 4040):
|
||||||
|
- Same Docker-based deployment
|
||||||
|
|
||||||
|
### Phase 3: Parallel Operation
|
||||||
|
|
||||||
|
Run both monitoring01 and monitoring02 simultaneously:
|
||||||
|
|
||||||
|
1. **Dual scraping**: Both hosts scrape the same targets
|
||||||
|
- Validates VictoriaMetrics is collecting data correctly
|
||||||
|
|
||||||
|
2. **Dual log shipping**: Configure Promtail to send logs to both Loki instances
|
||||||
|
- Add second client in `system/monitoring/logs.nix` pointing to monitoring02
|
||||||
|
|
||||||
|
3. **Validate dashboards**: Access Grafana on monitoring02, verify dashboards work
|
||||||
|
|
||||||
|
4. **Validate alerts**: Verify vmalert evaluates rules correctly (no receiver = no notifications)
|
||||||
|
|
||||||
|
5. **Compare resource usage**: Monitor disk/memory consumption between hosts
|
||||||
|
|
||||||
|
### Phase 4: Add monitoring CNAME
|
||||||
|
|
||||||
|
Add CNAME to monitoring02 once validated:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# hosts/monitoring02/configuration.nix
|
||||||
|
homelab.dns.cnames = [ "monitoring" ];
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates `monitoring.home.2rjus.net` pointing to monitoring02.
|
||||||
|
|
||||||
|
### Phase 5: Update References
|
||||||
|
|
||||||
|
Update hardcoded references to use the CNAME:
|
||||||
|
|
||||||
|
1. **system/monitoring/logs.nix**:
|
||||||
|
- Remove dual-shipping, point only to `http://monitoring.home.2rjus.net:3100`
|
||||||
|
|
||||||
|
2. **services/http-proxy/proxy.nix**: Update reverse proxy backends:
|
||||||
|
- prometheus.home.2rjus.net -> monitoring.home.2rjus.net:8428
|
||||||
|
- alertmanager.home.2rjus.net -> monitoring.home.2rjus.net:9093
|
||||||
|
- grafana.home.2rjus.net -> monitoring.home.2rjus.net:3000
|
||||||
|
- pyroscope.home.2rjus.net -> monitoring.home.2rjus.net:4040
|
||||||
|
|
||||||
|
Note: `hosts/template2/bootstrap.nix` stays pointed at monitoring01 until decommission.
|
||||||
|
|
||||||
|
### Phase 6: Enable Alerting
|
||||||
|
|
||||||
|
Once ready to cut over:
|
||||||
|
1. Enable Alertmanager receiver on monitoring02
|
||||||
|
2. Verify test alerts route correctly
|
||||||
|
|
||||||
|
### Phase 7: Cutover and Decommission
|
||||||
|
|
||||||
|
1. **Stop monitoring01**: Prevent duplicate alerts during transition
|
||||||
|
2. **Update bootstrap.nix**: Point to `monitoring.home.2rjus.net`
|
||||||
|
3. **Verify all targets scraped**: Check VictoriaMetrics UI
|
||||||
|
4. **Verify logs flowing**: Check Loki on monitoring02
|
||||||
|
5. **Decommission monitoring01**:
|
||||||
|
- Remove from flake.nix
|
||||||
|
- Remove host configuration
|
||||||
|
- Destroy VM in Proxmox
|
||||||
|
- Remove from terraform state
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] What disk size for monitoring02? 100GB should allow 3+ months with VictoriaMetrics compression
|
||||||
|
- [ ] Which dashboards to recreate declaratively? (Review monitoring01 Grafana for current set)
|
||||||
|
|
||||||
|
## VictoriaMetrics Service Configuration
|
||||||
|
|
||||||
|
Example NixOS configuration for monitoring02:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# VictoriaMetrics replaces Prometheus
|
||||||
|
services.victoriametrics = {
|
||||||
|
enable = true;
|
||||||
|
retentionPeriod = "3m"; # 3 months, increase based on disk usage
|
||||||
|
prometheusConfig = {
|
||||||
|
global.scrape_interval = "15s";
|
||||||
|
scrape_configs = [
|
||||||
|
# Auto-generated node-exporter targets
|
||||||
|
# Service-specific scrape targets
|
||||||
|
# External targets
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# vmalert for alerting rules (no receiver during parallel operation)
|
||||||
|
services.vmalert = {
|
||||||
|
enable = true;
|
||||||
|
datasource.url = "http://localhost:8428";
|
||||||
|
# notifier.alertmanager.url = "http://localhost:9093"; # Enable after cutover
|
||||||
|
rule = [ ./rules.yml ];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Rollback Plan
|
||||||
|
|
||||||
|
If issues arise after cutover:
|
||||||
|
1. Move `monitoring` CNAME back to monitoring01
|
||||||
|
2. Restart monitoring01 services
|
||||||
|
3. Revert Promtail config to point only to monitoring01
|
||||||
|
4. Revert http-proxy backends
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- VictoriaMetrics uses port 8428 vs Prometheus 9090
|
||||||
|
- PromQL compatibility is excellent
|
||||||
|
- VictoriaMetrics native push replaces Pushgateway (remove from http-proxy if not needed)
|
||||||
|
- monitoring02 deployed via OpenTofu using `create-host` script
|
||||||
|
- Grafana dashboards defined declaratively via NixOS, not imported from monitoring01 state
|
||||||
212
docs/plans/nix-cache-reprovision.md
Normal file
212
docs/plans/nix-cache-reprovision.md
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
# Nix Cache Host Reprovision
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Reprovision `nix-cache01` using the OpenTofu workflow, and improve the build/cache system with:
|
||||||
|
1. NATS-based remote build triggering (replacing the current bash script)
|
||||||
|
2. Safer flake update workflow that validates builds before pushing to master
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### Host Configuration
|
||||||
|
- `nix-cache01` at 10.69.13.15 serves the binary cache via Harmonia
|
||||||
|
- Runs Gitea Actions runner for CI workflows
|
||||||
|
- Has `homelab.deploy.enable = true` (already supports NATS-based deployment)
|
||||||
|
- Uses a dedicated XFS volume at `/nix` for cache storage
|
||||||
|
|
||||||
|
### Current Build System (`services/nix-cache/build-flakes.sh`)
|
||||||
|
- Runs every 30 minutes via systemd timer
|
||||||
|
- Clones/pulls two repos: `nixos-servers` and `nixos` (gunter)
|
||||||
|
- Builds all hosts with `nixos-rebuild build` (no blacklist despite docs mentioning it)
|
||||||
|
- Pushes success/failure metrics to pushgateway
|
||||||
|
- Simple but has no filtering, no parallelism, no remote triggering
|
||||||
|
|
||||||
|
### Current Flake Update Workflow (`.github/workflows/flake-update.yaml`)
|
||||||
|
- Runs daily at midnight via cron
|
||||||
|
- Runs `nix flake update --commit-lock-file`
|
||||||
|
- Pushes directly to master
|
||||||
|
- No build validation — can push broken inputs
|
||||||
|
|
||||||
|
## Improvement 1: NATS-Based Remote Build Triggering
|
||||||
|
|
||||||
|
### Design
|
||||||
|
|
||||||
|
Extend the existing `homelab-deploy` tool to support a "build" command that triggers builds on the cache host. This reuses the NATS infrastructure already in place.
|
||||||
|
|
||||||
|
| Approach | Pros | Cons |
|
||||||
|
|----------|------|------|
|
||||||
|
| Extend homelab-deploy | Reuses existing NATS auth, NKey handling, CLI | Adds scope to existing tool |
|
||||||
|
| New nix-cache-tool | Clean separation | Duplicate NATS boilerplate, new credentials |
|
||||||
|
| Gitea Actions webhook | No custom tooling | Less flexible, tied to Gitea |
|
||||||
|
|
||||||
|
**Recommendation:** Extend `homelab-deploy` with a build subcommand. The tool already has NATS client code, authentication handling, and a listener module in NixOS.
|
||||||
|
|
||||||
|
### Implementation
|
||||||
|
|
||||||
|
1. Add new message type to homelab-deploy: `build.<host>` subject
|
||||||
|
2. Listener on nix-cache01 subscribes to `build.>` wildcard
|
||||||
|
3. On message receipt, builds the specified host and returns success/failure
|
||||||
|
4. CLI command: `homelab-deploy build <hostname>` or `homelab-deploy build --all`
|
||||||
|
|
||||||
|
### Benefits
|
||||||
|
- Trigger rebuild for specific host to ensure it's cached
|
||||||
|
- Could be called from CI after merging PRs
|
||||||
|
- Reuses existing NATS infrastructure and auth
|
||||||
|
- Progress/status could stream back via NATS reply
|
||||||
|
|
||||||
|
## Improvement 2: Smarter Flake Update Workflow
|
||||||
|
|
||||||
|
### Current Problems
|
||||||
|
1. Updates can push breaking changes to master
|
||||||
|
2. No visibility into what broke when it does
|
||||||
|
3. Hosts that auto-update can pull broken configs
|
||||||
|
|
||||||
|
### Proposed Workflow
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ Flake Update Workflow │
|
||||||
|
├─────────────────────────────────────────────────────────────────┤
|
||||||
|
│ 1. nix flake update (on feature branch) │
|
||||||
|
│ 2. Build ALL hosts locally │
|
||||||
|
│ 3. If all pass → fast-forward merge to master │
|
||||||
|
│ 4. If any fail → create PR with failure logs attached │
|
||||||
|
└─────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### Implementation Options
|
||||||
|
|
||||||
|
| Option | Description | Pros | Cons |
|
||||||
|
|--------|-------------|------|------|
|
||||||
|
| **A: Self-hosted runner** | Build on nix-cache01 | Fast (local cache), simple | Ties up cache host during build |
|
||||||
|
| **B: Gitea Actions only** | Use container runner | Clean separation | Slow (no cache), resource limits |
|
||||||
|
| **C: Hybrid** | Trigger builds on nix-cache01 via NATS from Actions | Best of both | More complex |
|
||||||
|
|
||||||
|
**Recommendation:** Option A with nix-cache01 as the runner. The host is already running Gitea Actions runner and has the cache. Building all ~16 hosts is disk I/O heavy but feasible on dedicated hardware.
|
||||||
|
|
||||||
|
### Workflow Steps
|
||||||
|
|
||||||
|
1. Workflow runs on schedule (daily or weekly)
|
||||||
|
2. Creates branch `flake-update/YYYY-MM-DD`
|
||||||
|
3. Runs `nix flake update --commit-lock-file`
|
||||||
|
4. Builds each host: `nix build .#nixosConfigurations.<host>.config.system.build.toplevel`
|
||||||
|
5. If all succeed:
|
||||||
|
- Fast-forward merge to master
|
||||||
|
- Delete feature branch
|
||||||
|
6. If any fail:
|
||||||
|
- Create PR from the update branch
|
||||||
|
- Attach build logs as PR comment
|
||||||
|
- Label PR with `needs-review` or `build-failure`
|
||||||
|
- Do NOT merge automatically
|
||||||
|
|
||||||
|
### Workflow File Changes
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# New: .github/workflows/flake-update-safe.yaml
|
||||||
|
name: Safe flake update
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: "0 2 * * 0" # Weekly on Sunday at 2 AM
|
||||||
|
workflow_dispatch: # Manual trigger
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
update-and-validate:
|
||||||
|
runs-on: homelab # Use self-hosted runner on nix-cache01
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: master
|
||||||
|
fetch-depth: 0 # Need full history for merge
|
||||||
|
|
||||||
|
- name: Create update branch
|
||||||
|
run: |
|
||||||
|
BRANCH="flake-update/$(date +%Y-%m-%d)"
git checkout -b "$BRANCH"
echo "BRANCH=$BRANCH" >> "$GITHUB_ENV"  # persist for the merge/PR steps below; shell vars do not survive across steps
|
||||||
|
|
||||||
|
- name: Update flake
|
||||||
|
run: nix flake update --commit-lock-file
|
||||||
|
|
||||||
|
- name: Build all hosts
|
||||||
|
id: build
|
||||||
|
run: |
|
||||||
|
set -o pipefail  # without this, the `if !` below tests tee's exit status, not nix build's
FAILED=""
|
||||||
|
for host in $(nix flake show --json | jq -r '.nixosConfigurations | keys[]'); do
|
||||||
|
echo "Building $host..."
|
||||||
|
if ! nix build ".#nixosConfigurations.$host.config.system.build.toplevel" 2>&1 | tee "build-$host.log"; then
|
||||||
|
FAILED="$FAILED $host"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
echo "failed=$FAILED" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Merge to master (if all pass)
|
||||||
|
if: steps.build.outputs.failed == ''
|
||||||
|
run: |
|
||||||
|
git checkout master
|
||||||
|
git merge --ff-only "$BRANCH"
|
||||||
|
git push origin master
|
||||||
|
git push origin --delete "$BRANCH"
|
||||||
|
|
||||||
|
- name: Create PR (if any fail)
|
||||||
|
if: steps.build.outputs.failed != ''
|
||||||
|
run: |
|
||||||
|
git push origin "$BRANCH"
|
||||||
|
# Create PR via Gitea API with build logs
|
||||||
|
# ... (PR creation with log attachment)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Migration Steps
|
||||||
|
|
||||||
|
### Phase 1: Reprovision Host via OpenTofu
|
||||||
|
|
||||||
|
1. Add `nix-cache01` to `terraform/vms.tf`:
|
||||||
|
```hcl
|
||||||
|
"nix-cache01" = {
|
||||||
|
ip = "10.69.13.15/24"
|
||||||
|
cpu_cores = 4
|
||||||
|
memory = 8192
|
||||||
|
disk_size = "100G" # Larger for nix store
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Shut down existing nix-cache01 VM
|
||||||
|
3. Run `tofu apply` to provision new VM
|
||||||
|
4. Verify bootstrap completes and cache is serving
|
||||||
|
|
||||||
|
**Note:** The cache will be cold after reprovision. Run initial builds to populate.
|
||||||
|
|
||||||
|
### Phase 2: Add Build Triggering to homelab-deploy
|
||||||
|
|
||||||
|
1. Add `build` command to homelab-deploy CLI
|
||||||
|
2. Add listener handler in NixOS module for `build.*` subjects
|
||||||
|
3. Update nix-cache01 config to enable build listener
|
||||||
|
4. Test with `homelab-deploy build testvm01`
|
||||||
|
|
||||||
|
### Phase 3: Implement Safe Flake Update Workflow
|
||||||
|
|
||||||
|
1. Create `.github/workflows/flake-update-safe.yaml`
|
||||||
|
2. Disable or remove old `flake-update.yaml`
|
||||||
|
3. Test manually with `workflow_dispatch`
|
||||||
|
4. Monitor first automated run
|
||||||
|
|
||||||
|
### Phase 4: Remove Old Build Script
|
||||||
|
|
||||||
|
1. After new workflow is stable, remove:
|
||||||
|
- `services/nix-cache/build-flakes.nix`
|
||||||
|
- `services/nix-cache/build-flakes.sh`
|
||||||
|
2. The new workflow handles scheduled builds
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] What runner labels should the self-hosted runner use for the update workflow?
|
||||||
|
- [ ] Should we build hosts in parallel (faster) or sequentially (easier to debug)?
|
||||||
|
- [ ] How long to keep flake-update PRs open before auto-closing stale ones?
|
||||||
|
- [ ] Should successful updates trigger a NATS notification to rebuild all hosts?
|
||||||
|
- [ ] What to do about `gunter` (external nixos repo) - include in validation?
|
||||||
|
- [ ] Disk size for new nix-cache01 - is 100G enough for cache + builds?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The existing `homelab.deploy.enable = true` on nix-cache01 means it already has NATS connectivity
|
||||||
|
- The Harmonia service and cache signing key will work the same after reprovision
|
||||||
|
- Actions runner token is in Vault, will be provisioned automatically
|
||||||
|
- Consider adding a `homelab.host.role = "build-host"` label for monitoring/filtering
|
||||||
113
docs/plans/pgdb1-decommission.md
Normal file
113
docs/plans/pgdb1-decommission.md
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
# pgdb1 Decommissioning Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Decommission the pgdb1 PostgreSQL server. The only consumer was Open WebUI on gunter, which has been migrated to use a local PostgreSQL instance.
|
||||||
|
|
||||||
|
## Pre-flight Verification
|
||||||
|
|
||||||
|
Before proceeding, verify that gunter is no longer using pgdb1:
|
||||||
|
|
||||||
|
1. Check Open WebUI on gunter is configured for local PostgreSQL (not 10.69.13.16)
|
||||||
|
2. Optionally: Check pgdb1 for recent connection activity:
|
||||||
|
```bash
|
||||||
|
ssh pgdb1 'sudo -u postgres psql -c "SELECT * FROM pg_stat_activity WHERE datname IS NOT NULL;"'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Files to Remove
|
||||||
|
|
||||||
|
### Host Configuration
|
||||||
|
- `hosts/pgdb1/default.nix`
|
||||||
|
- `hosts/pgdb1/configuration.nix`
|
||||||
|
- `hosts/pgdb1/hardware-configuration.nix`
|
||||||
|
- `hosts/pgdb1/` (directory)
|
||||||
|
|
||||||
|
### Service Module
|
||||||
|
- `services/postgres/postgres.nix`
|
||||||
|
- `services/postgres/default.nix`
|
||||||
|
- `services/postgres/` (directory)
|
||||||
|
|
||||||
|
Note: This service module is only used by pgdb1, so it can be removed entirely.
|
||||||
|
|
||||||
|
### Flake Entry
|
||||||
|
Remove from `flake.nix` (lines 131-138):
|
||||||
|
```nix
|
||||||
|
pgdb1 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/pgdb1
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Vault AppRole
|
||||||
|
Remove from `terraform/vault/approle.tf` (lines 69-73):
|
||||||
|
```hcl
|
||||||
|
"pgdb1" = {
|
||||||
|
paths = [
|
||||||
|
"secret/data/hosts/pgdb1/*",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Monitoring Rules
|
||||||
|
Remove from `services/monitoring/rules.yml` the `postgres_down` alert (lines 359-365):
|
||||||
|
```yaml
|
||||||
|
- name: postgres_rules
|
||||||
|
rules:
|
||||||
|
- alert: postgres_down
|
||||||
|
expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
```
|
||||||
|
|
||||||
|
### Utility Scripts
|
||||||
|
Delete `rebuild-all.sh` entirely (obsolete script).
|
||||||
|
|
||||||
|
## Execution Steps
|
||||||
|
|
||||||
|
### Phase 1: Verification
|
||||||
|
- [ ] Confirm Open WebUI on gunter uses local PostgreSQL
|
||||||
|
- [ ] Verify no active connections to pgdb1
|
||||||
|
|
||||||
|
### Phase 2: Code Cleanup
|
||||||
|
- [ ] Create feature branch: `git checkout -b decommission-pgdb1`
|
||||||
|
- [ ] Remove `hosts/pgdb1/` directory
|
||||||
|
- [ ] Remove `services/postgres/` directory
|
||||||
|
- [ ] Remove pgdb1 entry from `flake.nix`
|
||||||
|
- [ ] Remove postgres alert from `services/monitoring/rules.yml`
|
||||||
|
- [ ] Delete `rebuild-all.sh` (obsolete)
|
||||||
|
- [ ] Run `nix flake check` to verify no broken references
|
||||||
|
- [ ] Commit changes
|
||||||
|
|
||||||
|
### Phase 3: Terraform Cleanup
|
||||||
|
- [ ] Remove pgdb1 from `terraform/vault/approle.tf`
|
||||||
|
- [ ] Run `tofu plan` in `terraform/vault/` to preview changes
|
||||||
|
- [ ] Run `tofu apply` to remove the AppRole
|
||||||
|
- [ ] Commit terraform changes
|
||||||
|
|
||||||
|
### Phase 4: Infrastructure Cleanup
|
||||||
|
- [ ] Shut down pgdb1 VM in Proxmox
|
||||||
|
- [ ] Delete the VM from Proxmox
|
||||||
|
- [ ] (Optional) Remove any DNS entries if not auto-generated
|
||||||
|
|
||||||
|
### Phase 5: Finalize
|
||||||
|
- [ ] Merge feature branch to master
|
||||||
|
- [ ] Trigger auto-upgrade on DNS servers (ns1, ns2) to remove DNS entry
|
||||||
|
- [ ] Move this plan to `docs/plans/completed/`
|
||||||
|
|
||||||
|
## Rollback
|
||||||
|
|
||||||
|
If issues arise after decommissioning:
|
||||||
|
1. The VM can be recreated from template using the git history
|
||||||
|
2. Database data would need to be restored from backup (if any exists)
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- pgdb1 IP: 10.69.13.16
|
||||||
|
- The postgres service allowed connections from gunter (10.69.30.105)
|
||||||
|
- No restic backup was configured for this host
|
||||||
122
docs/plans/remote-access.md
Normal file
122
docs/plans/remote-access.md
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
# Remote Access to Homelab Services
|
||||||
|
|
||||||
|
## Status: Planning
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Enable remote access to some or all homelab services from outside the internal network, without exposing anything directly to the internet.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- All services are only accessible from the internal 10.69.13.x network
|
||||||
|
- Exception: jelly01 has a WireGuard link to an external VPS
|
||||||
|
- No services are directly exposed to the public internet
|
||||||
|
|
||||||
|
## Constraints
|
||||||
|
|
||||||
|
- Nothing should be directly accessible from the outside
|
||||||
|
- Must use VPN or overlay network (no port forwarding of services)
|
||||||
|
- Self-hosted solutions preferred over managed services
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
### 1. WireGuard Gateway (Internal Router)
|
||||||
|
|
||||||
|
A dedicated NixOS host on the internal network with a WireGuard tunnel out to the VPS. The VPS becomes the public entry point, and the gateway routes traffic to internal services. Firewall rules on the gateway control which services are reachable.
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Simple, well-understood technology
|
||||||
|
- Already running WireGuard for jelly01
|
||||||
|
- Full control over routing and firewall rules
|
||||||
|
- Excellent NixOS module support
|
||||||
|
- No extra dependencies
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Hub-and-spoke topology (all traffic goes through VPS)
|
||||||
|
- Manual peer management
|
||||||
|
- Adding a new client device means editing configs on both VPS and gateway
|
||||||
|
|
||||||
|
### 2. WireGuard Mesh (No Relay)
|
||||||
|
|
||||||
|
Each client device connects directly to a WireGuard endpoint. Could be on the VPS which forwards to the homelab, or if there is a routable IP at home, directly to an internal host.
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Simple and fast
|
||||||
|
- No extra software
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Manual key and endpoint management for every peer
|
||||||
|
- Doesn't scale well
|
||||||
|
- If behind CGNAT, still needs the VPS as intermediary
|
||||||
|
|
||||||
|
### 3. Headscale (Self-Hosted Tailscale)
|
||||||
|
|
||||||
|
Run a Headscale control server (on the VPS or internally) and install the Tailscale client on homelab hosts and personal devices. Gets the Tailscale mesh networking UX without depending on Tailscale's infrastructure.
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Mesh topology - devices communicate directly via NAT traversal (DERP relay as fallback)
|
||||||
|
- Easy to add/remove devices
|
||||||
|
- ACL support for granular access control
|
||||||
|
- MagicDNS for service discovery
|
||||||
|
- Good NixOS support for both headscale server and tailscale client
|
||||||
|
- Subnet routing lets you expose the entire 10.69.13.x network or specific hosts without installing tailscale on every host
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- More moving parts than plain WireGuard
|
||||||
|
- Headscale is a third-party reimplementation, can lag behind Tailscale features
|
||||||
|
- Need to run and maintain the control server
|
||||||
|
|
||||||
|
### 4. Tailscale (Managed)
|
||||||
|
|
||||||
|
Same as Headscale but using Tailscale's hosted control plane.
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Zero infrastructure to manage on the control plane side
|
||||||
|
- Polished UX, well-maintained clients
|
||||||
|
- Free tier covers personal use
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Dependency on Tailscale's service
|
||||||
|
- Less aligned with self-hosting preference
|
||||||
|
- Coordination metadata goes through their servers (data plane is still peer-to-peer)
|
||||||
|
|
||||||
|
### 5. Netbird (Self-Hosted)
|
||||||
|
|
||||||
|
Open-source alternative to Tailscale with a self-hostable management server. WireGuard-based, supports ACLs and NAT traversal.
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Fully self-hostable
|
||||||
|
- Web UI for management
|
||||||
|
- ACL and peer grouping support
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Heavier to self-host (needs multiple components: management server, signal server, TURN relay)
|
||||||
|
- Less mature NixOS module support compared to Tailscale/Headscale
|
||||||
|
|
||||||
|
### 6. Nebula (by Defined Networking)
|
||||||
|
|
||||||
|
Certificate-based mesh VPN. Each node gets a certificate from a CA you control. No central coordination server needed at runtime.
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- No always-on control plane
|
||||||
|
- Certificate-based identity
|
||||||
|
- Lightweight
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Less convenient for ad-hoc device addition (need to issue certs)
|
||||||
|
- NAT traversal less mature than Tailscale's
|
||||||
|
- Smaller community/ecosystem
|
||||||
|
|
||||||
|
## Key Decision Points
|
||||||
|
|
||||||
|
- **Static public IP vs CGNAT?** Determines whether clients can connect directly to home network or need VPS relay.
|
||||||
|
- **Number of client devices?** If just phone and laptop, plain WireGuard via VPS is fine. More devices favors Headscale.
|
||||||
|
- **Per-service vs per-network access?** Gateway with firewall rules gives per-service control. Headscale ACLs can also do this. Plain WireGuard gives network-level access with gateway firewall for finer control.
|
||||||
|
- **Subnet routing vs per-host agents?** With Headscale/Tailscale, can either install client on every host, or use a single subnet router that advertises the 10.69.13.x range. The latter is closer to the gateway approach and avoids touching every host.
|
||||||
|
|
||||||
|
## Leading Candidates
|
||||||
|
|
||||||
|
Based on existing WireGuard experience, self-hosting preference, and NixOS stack:
|
||||||
|
|
||||||
|
1. **Headscale with a subnet router** - Best balance of convenience and self-hosting
|
||||||
|
2. **WireGuard gateway via VPS** - Simplest, most transparent, builds on existing setup
|
||||||
224
docs/plans/security-hardening.md
Normal file
224
docs/plans/security-hardening.md
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
# Security Hardening Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Address security gaps identified in infrastructure review. Focus areas: SSH hardening, network security, logging improvements, and secrets management.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- SSH allows password auth and unrestricted root login (`system/sshd.nix`)
|
||||||
|
- Firewall disabled on all hosts (`networking.firewall.enable = false`)
|
||||||
|
- Promtail ships logs over HTTP to Loki
|
||||||
|
- Loki has no authentication (`auth_enabled = false`)
|
||||||
|
- AppRole secret-IDs never expire (`secret_id_ttl = 0`)
|
||||||
|
- Vault TLS verification disabled by default (`skipTlsVerify = true`)
|
||||||
|
- Audit logging exists (`common/ssh-audit.nix`) but not applied globally
|
||||||
|
- Alert rules focus on availability, no security event detection
|
||||||
|
|
||||||
|
## Priority Matrix
|
||||||
|
|
||||||
|
| Issue | Severity | Effort | Priority |
|
||||||
|
|-------|----------|--------|----------|
|
||||||
|
| SSH password auth | High | Low | **P1** |
|
||||||
|
| Firewall disabled | High | Medium | **P1** |
|
||||||
|
| Promtail HTTP (no TLS) | High | Medium | **P2** |
|
||||||
|
| No security alerting | Medium | Low | **P2** |
|
||||||
|
| Audit logging not global | Low | Low | **P2** |
|
||||||
|
| Loki no auth | Medium | Medium | **P3** |
|
||||||
|
| Secret-ID TTL | Medium | Medium | **P3** |
|
||||||
|
| Vault skipTlsVerify | Medium | Low | **P3** |
|
||||||
|
|
||||||
|
## Phase 1: Quick Wins (P1)
|
||||||
|
|
||||||
|
### 1.1 SSH Hardening
|
||||||
|
|
||||||
|
Edit `system/sshd.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.openssh = {
|
||||||
|
enable = true;
|
||||||
|
settings = {
|
||||||
|
PermitRootLogin = "prohibit-password"; # Key-only root login
|
||||||
|
PasswordAuthentication = false;
|
||||||
|
KbdInteractiveAuthentication = false;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prerequisite:** Verify all hosts have SSH keys deployed for root.
|
||||||
|
|
||||||
|
### 1.2 Enable Firewall
|
||||||
|
|
||||||
|
Create `system/firewall.nix` with default deny policy:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{ ... }: {
|
||||||
|
networking.firewall.enable = true;
|
||||||
|
|
||||||
|
# Use openssh's built-in firewall integration
|
||||||
|
services.openssh.openFirewall = true;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Useful firewall options:**
|
||||||
|
|
||||||
|
| Option | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| `networking.firewall.trustedInterfaces` | Accept all traffic from these interfaces (e.g., `[ "lo" ]`) |
|
||||||
|
| `networking.firewall.interfaces.<name>.allowedTCPPorts` | Per-interface port rules |
|
||||||
|
| `networking.firewall.extraInputRules` | Custom nftables rules (for complex filtering) |
|
||||||
|
|
||||||
|
**Network range restrictions:** Consider restricting SSH to the infrastructure subnet (`10.69.13.0/24`) using `extraInputRules` for defense in depth. However, this adds complexity and may not be necessary given the trusted network model.
|
||||||
|
|
||||||
|
#### Per-Interface Rules (http-proxy WireGuard)
|
||||||
|
|
||||||
|
The `http-proxy` host has a WireGuard interface (`wg0`) that may need different rules than the LAN interface. Use `networking.firewall.interfaces` to apply per-interface policies:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Example: http-proxy with different rules per interface
|
||||||
|
networking.firewall = {
|
||||||
|
enable = true;
|
||||||
|
|
||||||
|
# Default: only SSH (via openFirewall)
|
||||||
|
allowedTCPPorts = [ ];
|
||||||
|
|
||||||
|
# LAN interface: allow HTTP/HTTPS
|
||||||
|
interfaces.ens18 = {
|
||||||
|
allowedTCPPorts = [ 80 443 ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# WireGuard interface: restrict to specific services or trust fully
|
||||||
|
interfaces.wg0 = {
|
||||||
|
allowedTCPPorts = [ 80 443 ];
|
||||||
|
# Or use trustedInterfaces = [ "wg0" ] if fully trusted
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**TODO:** Investigate current WireGuard usage on http-proxy to determine appropriate rules.
|
||||||
|
|
||||||
|
Then per-host, open required ports:
|
||||||
|
|
||||||
|
| Host | Additional Ports |
|
||||||
|
|------|------------------|
|
||||||
|
| ns1/ns2 | 53 (TCP/UDP) |
|
||||||
|
| vault01 | 8200 |
|
||||||
|
| monitoring01 | 3100, 9090, 3000, 9093 |
|
||||||
|
| http-proxy | 80, 443 |
|
||||||
|
| nats1 | 4222 |
|
||||||
|
| ha1 | 1883, 8123 |
|
||||||
|
| jelly01 | 8096 |
|
||||||
|
| nix-cache01 | 5000 |
|
||||||
|
|
||||||
|
## Phase 2: Logging & Detection (P2)
|
||||||
|
|
||||||
|
### 2.1 Enable TLS for Promtail → Loki
|
||||||
|
|
||||||
|
Update `system/monitoring/logs.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
clients = [{
|
||||||
|
url = "https://monitoring01.home.2rjus.net:3100/loki/api/v1/push";
|
||||||
|
tls_config = {
|
||||||
|
ca_file = "/etc/ssl/certs/homelab-root-ca.pem";
|
||||||
|
};
|
||||||
|
}];
|
||||||
|
```
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- Configure Loki with TLS certificate (use internal ACME)
|
||||||
|
- Ensure all hosts trust root CA (already done via `system/pki/root-ca.nix`)
|
||||||
|
|
||||||
|
### 2.2 Security Alert Rules
|
||||||
|
|
||||||
|
Add to `services/monitoring/rules.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: security_rules
|
||||||
|
rules:
|
||||||
|
# NOTE(review): despite the name, this expression counts logind sessions
# (successful logins), not failed auth attempts — failed attempts are
# covered by the Loki-based "Failed password" alert described later.
- alert: ssh_auth_failures
|
||||||
|
expr: increase(node_logind_sessions_total[5m]) > 20
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Unusual login activity on {{ $labels.instance }}"
|
||||||
|
|
||||||
|
- alert: vault_secret_fetch_failure
|
||||||
|
expr: increase(vault_secret_failures[5m]) > 5
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Vault secret fetch failures on {{ $labels.instance }}"
|
||||||
|
```
|
||||||
|
|
||||||
|
Also add Loki-based alerts for:
|
||||||
|
- Failed SSH attempts: `{job="systemd-journal"} |= "Failed password"`
|
||||||
|
- sudo usage: `{job="systemd-journal"} |= "sudo"`
|
||||||
|
|
||||||
|
### 2.3 Global Audit Logging
|
||||||
|
|
||||||
|
Add `./common/ssh-audit.nix` import to `system/default.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
imports = [
|
||||||
|
# ... existing imports
|
||||||
|
../common/ssh-audit.nix
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
## Phase 3: Defense in Depth (P3)
|
||||||
|
|
||||||
|
### 3.1 Loki Authentication
|
||||||
|
|
||||||
|
Options:
|
||||||
|
1. **Basic auth via reverse proxy** - Put Loki behind Caddy with auth
|
||||||
|
2. **Loki multi-tenancy** - Enable `auth_enabled = true` and use tenant IDs
|
||||||
|
3. **Network isolation** - Bind Loki only to localhost, expose via authenticated proxy
|
||||||
|
|
||||||
|
Recommendation: Option 1 (reverse proxy) is simplest for homelab.
|
||||||
|
|
||||||
|
### 3.2 AppRole Secret Rotation
|
||||||
|
|
||||||
|
Update `terraform/vault/approle.tf`:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
secret_id_ttl = 2592000 # 30 days
|
||||||
|
```
|
||||||
|
|
||||||
|
Add documentation for manual rotation procedure or implement automated rotation via the existing `restartTrigger` mechanism in `vault-secrets.nix`.
|
||||||
|
|
||||||
|
### 3.3 Enable Vault TLS Verification
|
||||||
|
|
||||||
|
Change default in `system/vault-secrets.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
skipTlsVerify = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = false; # Changed from true
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prerequisite:** Verify all hosts trust the internal CA that signed the Vault certificate.
|
||||||
|
|
||||||
|
## Implementation Order
|
||||||
|
|
||||||
|
1. **Test on test-tier first** - Deploy phases 1-2 to testvm01/02/03
|
||||||
|
2. **Validate SSH access** - Ensure key-based login works before disabling passwords
|
||||||
|
3. **Document firewall ports** - Create reference of ports per host before enabling
|
||||||
|
4. **Phase prod rollout** - Deploy to prod hosts one at a time, verify each
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] Do all hosts have SSH keys configured for root access?
|
||||||
|
- [ ] Should firewall rules be per-host or use a central definition with roles?
|
||||||
|
- [ ] Should Loki authentication use the existing Kanidm setup?
|
||||||
|
|
||||||
|
**Resolved:** Password-based SSH access for recovery is not required - most hosts have console access through Proxmox or physical access, which provides an out-of-band recovery path if SSH keys fail.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Firewall changes are the highest risk - test thoroughly on test-tier
|
||||||
|
- SSH hardening must not lock out access - verify keys first
|
||||||
|
- Consider creating a "break glass" procedure for emergency access if keys fail
|
||||||
267
docs/user-management.md
Normal file
267
docs/user-management.md
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
# User Management with Kanidm
|
||||||
|
|
||||||
|
Central authentication for the homelab using Kanidm.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
- **Server**: kanidm01.home.2rjus.net (auth.home.2rjus.net)
|
||||||
|
- **WebUI**: https://auth.home.2rjus.net
|
||||||
|
- **LDAPS**: port 636
|
||||||
|
|
||||||
|
## CLI Setup
|
||||||
|
|
||||||
|
The `kanidm` CLI is available in the devshell:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop
|
||||||
|
|
||||||
|
# Login as idm_admin
|
||||||
|
kanidm login --name idm_admin --url https://auth.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## User Management
|
||||||
|
|
||||||
|
POSIX users are managed imperatively via the `kanidm` CLI. This allows setting
|
||||||
|
all attributes (including UNIX password) in one workflow.
|
||||||
|
|
||||||
|
### Creating a POSIX User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create the person
|
||||||
|
kanidm person create <username> "<Display Name>"
|
||||||
|
|
||||||
|
# Add to groups
|
||||||
|
kanidm group add-members ssh-users <username>
|
||||||
|
|
||||||
|
# Enable POSIX (UID is auto-assigned)
|
||||||
|
kanidm person posix set <username>
|
||||||
|
|
||||||
|
# Set UNIX password (required for SSH login, min 10 characters)
|
||||||
|
kanidm person posix set-password <username>
|
||||||
|
|
||||||
|
# Optionally set login shell
|
||||||
|
kanidm person posix set <username> --shell /bin/zsh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Full User Creation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person create testuser "Test User"
|
||||||
|
kanidm group add-members ssh-users testuser
|
||||||
|
kanidm person posix set testuser
|
||||||
|
kanidm person posix set-password testuser
|
||||||
|
kanidm person get testuser
|
||||||
|
```
|
||||||
|
|
||||||
|
After creation, verify on a client host:
|
||||||
|
```bash
|
||||||
|
getent passwd testuser
|
||||||
|
ssh testuser@testvm01.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
### Viewing User Details
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Removing a User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person delete <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Group Management
|
||||||
|
|
||||||
|
Groups for POSIX access are also managed via CLI.
|
||||||
|
|
||||||
|
### Creating a POSIX Group
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create the group
|
||||||
|
kanidm group create <group-name>
|
||||||
|
|
||||||
|
# Enable POSIX with a specific GID
|
||||||
|
kanidm group posix set <group-name> --gidnumber <gid>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adding Members
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group add-members <group-name> <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Viewing Group Details
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group get <group-name>
|
||||||
|
kanidm group list-members <group-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Full Group Creation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group create testgroup
|
||||||
|
kanidm group posix set testgroup --gidnumber 68010
|
||||||
|
kanidm group add-members testgroup testuser
|
||||||
|
kanidm group get testgroup
|
||||||
|
```
|
||||||
|
|
||||||
|
After creation, verify on a client host:
|
||||||
|
```bash
|
||||||
|
getent group testgroup
|
||||||
|
```
|
||||||
|
|
||||||
|
### Current Groups
|
||||||
|
|
||||||
|
| Group | GID | Purpose |
|
||||||
|
|-------|-----|---------|
|
||||||
|
| ssh-users | 68000 | SSH login access |
|
||||||
|
| admins | 68001 | Administrative access |
|
||||||
|
| users | 68002 | General users |
|
||||||
|
|
||||||
|
### UID/GID Allocation
|
||||||
|
|
||||||
|
Kanidm auto-assigns UIDs/GIDs from its configured range. For manually assigned GIDs:
|
||||||
|
|
||||||
|
| Range | Purpose |
|
||||||
|
|-------|---------|
|
||||||
|
| 65,536+ | Users (auto-assigned) |
|
||||||
|
| 68,000 - 68,999 | Groups (manually assigned) |
|
||||||
|
|
||||||
|
## PAM/NSS Client Configuration
|
||||||
|
|
||||||
|
Enable central authentication on a host:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
```
|
||||||
|
|
||||||
|
This configures:
|
||||||
|
- `services.kanidm.enablePam = true`
|
||||||
|
- Client connection to auth.home.2rjus.net
|
||||||
|
- Login authorization for `ssh-users` group
|
||||||
|
- Short usernames (`torjus` instead of `torjus@home.2rjus.net`)
|
||||||
|
- Home directory symlinks (`/home/torjus` → UUID-based directory)
|
||||||
|
|
||||||
|
### Enabled Hosts
|
||||||
|
|
||||||
|
- testvm01, testvm02, testvm03 (test tier)
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.kanidm = {
|
||||||
|
enable = true;
|
||||||
|
server = "https://auth.home.2rjus.net"; # default
|
||||||
|
allowedLoginGroups = [ "ssh-users" ]; # default
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Home Directories
|
||||||
|
|
||||||
|
Home directories use UUID-based paths for stability (so renaming a user doesn't
|
||||||
|
require moving their home directory). Symlinks provide convenient access:
|
||||||
|
|
||||||
|
```
|
||||||
|
/home/torjus -> /home/e4f4c56c-4aee-4c20-846f-90cb69807733
|
||||||
|
```
|
||||||
|
|
||||||
|
The symlinks are created by `kanidm-unixd-tasks` on first login.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Verify NSS Resolution
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check user resolution
|
||||||
|
getent passwd <username>
|
||||||
|
|
||||||
|
# Check group resolution
|
||||||
|
getent group <group-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test SSH Login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh <username>@<hostname>.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### "PAM user mismatch" error
|
||||||
|
|
||||||
|
SSH fails with "fatal: PAM user mismatch" in logs. This happens when Kanidm returns
|
||||||
|
usernames in SPN format (`torjus@home.2rjus.net`) but SSH expects short names (`torjus`).
|
||||||
|
|
||||||
|
**Solution**: Configure `uid_attr_map = "name"` in unixSettings (already set in our module).
|
||||||
|
|
||||||
|
Check current format:
|
||||||
|
```bash
|
||||||
|
getent passwd torjus
|
||||||
|
# Should show: torjus:x:65536:...
|
||||||
|
# NOT: torjus@home.2rjus.net:x:65536:...
|
||||||
|
```
|
||||||
|
|
||||||
|
### User resolves but SSH fails immediately
|
||||||
|
|
||||||
|
The user's login group (e.g., `ssh-users`) likely doesn't have POSIX enabled:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check if group has POSIX
|
||||||
|
getent group ssh-users
|
||||||
|
|
||||||
|
# If empty, enable POSIX on the server
|
||||||
|
kanidm group posix set ssh-users --gidnumber 68000
|
||||||
|
```
|
||||||
|
|
||||||
|
### User doesn't resolve via getent
|
||||||
|
|
||||||
|
1. Check kanidm-unixd service is running:
|
||||||
|
```bash
|
||||||
|
systemctl status kanidm-unixd
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check unixd can reach server:
|
||||||
|
```bash
|
||||||
|
kanidm-unix status
|
||||||
|
# Should show: system: online, Kanidm: online
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Check client can reach server:
|
||||||
|
```bash
|
||||||
|
curl -s https://auth.home.2rjus.net/status
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Check user has POSIX enabled on server:
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Restart nscd to clear stale cache:
|
||||||
|
```bash
|
||||||
|
systemctl restart nscd
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Invalidate kanidm cache:
|
||||||
|
```bash
|
||||||
|
kanidm-unix cache-invalidate
|
||||||
|
```
|
||||||
|
|
||||||
|
### Changes not taking effect after deployment
|
||||||
|
|
||||||
|
NixOS uses nsncd (a Rust reimplementation of nscd) for NSS caching. After deploying
|
||||||
|
kanidm-unixd config changes, you may need to restart both services:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
systemctl restart kanidm-unixd
|
||||||
|
systemctl restart nscd
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test PAM authentication directly
|
||||||
|
|
||||||
|
Use the kanidm-unix CLI to test PAM auth without SSH:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm-unix auth-test --name <username>
|
||||||
|
```
|
||||||
66
flake.lock
generated
66
flake.lock
generated
@@ -21,25 +21,45 @@
|
|||||||
"url": "https://git.t-juice.club/torjus/alerttonotify"
|
"url": "https://git.t-juice.club/torjus/alerttonotify"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"labmon": {
|
"homelab-deploy": {
|
||||||
"inputs": {
|
"inputs": {
|
||||||
"nixpkgs": [
|
"nixpkgs": [
|
||||||
"nixpkgs-unstable"
|
"nixpkgs-unstable"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1748983975,
|
"lastModified": 1770481834,
|
||||||
"narHash": "sha256-DA5mOqxwLMj/XLb4hvBU1WtE6cuVej7PjUr8N0EZsCE=",
|
"narHash": "sha256-Xx9BYnI0C/qgPbwr9nj6NoAdQTbYLunrdbNSaUww9oY=",
|
||||||
"ref": "master",
|
"ref": "master",
|
||||||
"rev": "040a73e891a70ff06ec7ab31d7167914129dbf7d",
|
"rev": "fd0d63b103dfaf21d1c27363266590e723021c67",
|
||||||
"revCount": 17,
|
"revCount": 24,
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/labmon"
|
"url": "https://git.t-juice.club/torjus/homelab-deploy"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"ref": "master",
|
"ref": "master",
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/labmon"
|
"url": "https://git.t-juice.club/torjus/homelab-deploy"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixos-exporter": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs-unstable"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1770422522,
|
||||||
|
"narHash": "sha256-WmIFnquu4u58v8S2bOVWmknRwHn4x88CRfBFTzJ1inQ=",
|
||||||
|
"ref": "refs/heads/master",
|
||||||
|
"rev": "cf0ce858997af4d8dcc2ce10393ff393e17fc911",
|
||||||
|
"revCount": 11,
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://git.t-juice.club/torjus/nixos-exporter"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://git.t-juice.club/torjus/nixos-exporter"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
@@ -60,11 +80,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs-unstable": {
|
"nixpkgs-unstable": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1770115704,
|
"lastModified": 1770197578,
|
||||||
"narHash": "sha256-KHFT9UWOF2yRPlAnSXQJh6uVcgNcWlFqqiAZ7OVlHNc=",
|
"narHash": "sha256-AYqlWrX09+HvGs8zM6ebZ1pwUqjkfpnv8mewYwAo+iM=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "e6eae2ee2110f3d31110d5c222cd395303343b08",
|
"rev": "00c21e4c93d963c50d4c0c89bfa84ed6e0694df2",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -77,30 +97,10 @@
|
|||||||
"root": {
|
"root": {
|
||||||
"inputs": {
|
"inputs": {
|
||||||
"alerttonotify": "alerttonotify",
|
"alerttonotify": "alerttonotify",
|
||||||
"labmon": "labmon",
|
"homelab-deploy": "homelab-deploy",
|
||||||
|
"nixos-exporter": "nixos-exporter",
|
||||||
"nixpkgs": "nixpkgs",
|
"nixpkgs": "nixpkgs",
|
||||||
"nixpkgs-unstable": "nixpkgs-unstable",
|
"nixpkgs-unstable": "nixpkgs-unstable"
|
||||||
"sops-nix": "sops-nix"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"sops-nix": {
|
|
||||||
"inputs": {
|
|
||||||
"nixpkgs": [
|
|
||||||
"nixpkgs-unstable"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"locked": {
|
|
||||||
"lastModified": 1770145881,
|
|
||||||
"narHash": "sha256-ktjWTq+D5MTXQcL9N6cDZXUf9kX8JBLLBLT0ZyOTSYY=",
|
|
||||||
"owner": "Mic92",
|
|
||||||
"repo": "sops-nix",
|
|
||||||
"rev": "17eea6f3816ba6568b8c81db8a4e6ca438b30b7c",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
"original": {
|
|
||||||
"owner": "Mic92",
|
|
||||||
"repo": "sops-nix",
|
|
||||||
"type": "github"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
365
flake.nix
365
flake.nix
@@ -5,16 +5,16 @@
|
|||||||
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-25.11";
|
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-25.11";
|
||||||
nixpkgs-unstable.url = "github:nixos/nixpkgs?ref=nixos-unstable";
|
nixpkgs-unstable.url = "github:nixos/nixpkgs?ref=nixos-unstable";
|
||||||
|
|
||||||
sops-nix = {
|
|
||||||
url = "github:Mic92/sops-nix";
|
|
||||||
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
|
||||||
};
|
|
||||||
alerttonotify = {
|
alerttonotify = {
|
||||||
url = "git+https://git.t-juice.club/torjus/alerttonotify?ref=master";
|
url = "git+https://git.t-juice.club/torjus/alerttonotify?ref=master";
|
||||||
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
||||||
};
|
};
|
||||||
labmon = {
|
nixos-exporter = {
|
||||||
url = "git+https://git.t-juice.club/torjus/labmon?ref=master";
|
url = "git+https://git.t-juice.club/torjus/nixos-exporter";
|
||||||
|
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
||||||
|
};
|
||||||
|
homelab-deploy = {
|
||||||
|
url = "git+https://git.t-juice.club/torjus/homelab-deploy?ref=master";
|
||||||
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@@ -24,9 +24,9 @@
|
|||||||
self,
|
self,
|
||||||
nixpkgs,
|
nixpkgs,
|
||||||
nixpkgs-unstable,
|
nixpkgs-unstable,
|
||||||
sops-nix,
|
|
||||||
alerttonotify,
|
alerttonotify,
|
||||||
labmon,
|
nixos-exporter,
|
||||||
|
homelab-deploy,
|
||||||
...
|
...
|
||||||
}@inputs:
|
}@inputs:
|
||||||
let
|
let
|
||||||
@@ -40,7 +40,19 @@
|
|||||||
commonOverlays = [
|
commonOverlays = [
|
||||||
overlay-unstable
|
overlay-unstable
|
||||||
alerttonotify.overlays.default
|
alerttonotify.overlays.default
|
||||||
labmon.overlays.default
|
];
|
||||||
|
# Common modules applied to all hosts
|
||||||
|
commonModules = [
|
||||||
|
(
|
||||||
|
{ config, pkgs, ... }:
|
||||||
|
{
|
||||||
|
nixpkgs.overlays = commonOverlays;
|
||||||
|
system.configurationRevision = self.rev or self.dirtyRev or "dirty";
|
||||||
|
}
|
||||||
|
)
|
||||||
|
nixos-exporter.nixosModules.default
|
||||||
|
homelab-deploy.nixosModules.default
|
||||||
|
./modules/homelab
|
||||||
];
|
];
|
||||||
allSystems = [
|
allSystems = [
|
||||||
"x86_64-linux"
|
"x86_64-linux"
|
||||||
@@ -53,325 +65,130 @@
|
|||||||
in
|
in
|
||||||
{
|
{
|
||||||
nixosConfigurations = {
|
nixosConfigurations = {
|
||||||
ns1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ns2 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns2
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ns3 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns3
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ns4 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns4
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
nixos-test1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/nixos-test1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ha1 = nixpkgs.lib.nixosSystem {
|
ha1 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ha1
|
./hosts/ha1
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
template1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/template
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
template2 = nixpkgs.lib.nixosSystem {
|
template2 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/template2
|
./hosts/template2
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
http-proxy = nixpkgs.lib.nixosSystem {
|
http-proxy = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/http-proxy
|
./hosts/http-proxy
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ca = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ca
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
monitoring01 = nixpkgs.lib.nixosSystem {
|
monitoring01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/monitoring01
|
./hosts/monitoring01
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
labmon.nixosModules.labmon
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
jelly01 = nixpkgs.lib.nixosSystem {
|
jelly01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/jelly01
|
./hosts/jelly01
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
nix-cache01 = nixpkgs.lib.nixosSystem {
|
nix-cache01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/nix-cache01
|
./hosts/nix-cache01
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
media1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/media1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
pgdb1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/pgdb1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
nats1 = nixpkgs.lib.nixosSystem {
|
nats1 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/nats1
|
./hosts/nats1
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
auth01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/auth01
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
testvm01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/testvm01
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
vault01 = nixpkgs.lib.nixosSystem {
|
vault01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/vault01
|
./hosts/vault01
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
vaulttest01 = nixpkgs.lib.nixosSystem {
|
testvm01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
./hosts/testvm01
|
||||||
{ config, pkgs, ... }:
|
];
|
||||||
{
|
};
|
||||||
nixpkgs.overlays = commonOverlays;
|
testvm02 = nixpkgs.lib.nixosSystem {
|
||||||
}
|
inherit system;
|
||||||
)
|
specialArgs = {
|
||||||
./hosts/vaulttest01
|
inherit inputs self;
|
||||||
sops-nix.nixosModules.sops
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/testvm02
|
||||||
|
];
|
||||||
|
};
|
||||||
|
testvm03 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/testvm03
|
||||||
|
];
|
||||||
|
};
|
||||||
|
ns2 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/ns2
|
||||||
|
];
|
||||||
|
};
|
||||||
|
ns1 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/ns1
|
||||||
|
];
|
||||||
|
};
|
||||||
|
kanidm01 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/kanidm01
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@@ -386,11 +203,13 @@
|
|||||||
{ pkgs }:
|
{ pkgs }:
|
||||||
{
|
{
|
||||||
default = pkgs.mkShell {
|
default = pkgs.mkShell {
|
||||||
packages = with pkgs; [
|
packages = [
|
||||||
ansible
|
pkgs.ansible
|
||||||
opentofu
|
pkgs.opentofu
|
||||||
openbao
|
pkgs.openbao
|
||||||
|
pkgs.kanidm_1_8
|
||||||
(pkgs.callPackage ./scripts/create-host { })
|
(pkgs.callPackage ./scripts/create-host { })
|
||||||
|
homelab-deploy.packages.${pkgs.system}.default
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,8 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
../../services/lldap
|
|
||||||
../../services/authelia
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
../../services/ca
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
@@ -55,8 +55,17 @@
|
|||||||
git
|
git
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Vault secrets management
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
vault.secrets.backup-helper = {
|
||||||
|
secretPath = "shared/backup/password";
|
||||||
|
extractKey = "password";
|
||||||
|
outputDir = "/run/secrets/backup_helper_secret";
|
||||||
|
services = [ "restic-backups-ha1" ];
|
||||||
|
};
|
||||||
|
|
||||||
# Backup service dirs
|
# Backup service dirs
|
||||||
sops.secrets."backup_helper_secret" = { };
|
|
||||||
services.restic.backups.ha1 = {
|
services.restic.backups.ha1 = {
|
||||||
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
||||||
passwordFile = "/run/secrets/backup_helper_secret";
|
passwordFile = "/run/secrets/backup_helper_secret";
|
||||||
@@ -68,6 +77,7 @@
|
|||||||
timerConfig = {
|
timerConfig = {
|
||||||
OnCalendar = "daily";
|
OnCalendar = "daily";
|
||||||
Persistent = true;
|
Persistent = true;
|
||||||
|
RandomizedDelaySec = "2h";
|
||||||
};
|
};
|
||||||
pruneOpts = [
|
pruneOpts = [
|
||||||
"--keep-daily 7"
|
"--keep-daily 7"
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
@@ -21,8 +21,6 @@
|
|||||||
"prometheus"
|
"prometheus"
|
||||||
"alertmanager"
|
"alertmanager"
|
||||||
"jelly"
|
"jelly"
|
||||||
"auth"
|
|
||||||
"lldap"
|
|
||||||
"pyroscope"
|
"pyroscope"
|
||||||
"pushgw"
|
"pushgw"
|
||||||
];
|
];
|
||||||
@@ -62,6 +60,9 @@
|
|||||||
"nix-command"
|
"nix-command"
|
||||||
"flakes"
|
"flakes"
|
||||||
];
|
];
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
|
|||||||
42
hosts/http-proxy/hardware-configuration.nix
Normal file
42
hosts/http-proxy/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -1,9 +1,12 @@
|
|||||||
{ config, ... }:
|
{ config, ... }:
|
||||||
{
|
{
|
||||||
sops.secrets.wireguard_private_key = {
|
vault.secrets.wireguard = {
|
||||||
sopsFile = ../../secrets/http-proxy/wireguard.yaml;
|
secretPath = "hosts/http-proxy/wireguard";
|
||||||
key = "wg_private_key";
|
extractKey = "private_key";
|
||||||
|
outputDir = "/run/secrets/wireguard_private_key";
|
||||||
|
services = [ "wireguard-wg0" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.wireguard = {
|
networking.wireguard = {
|
||||||
enable = true;
|
enable = true;
|
||||||
useNetworkd = true;
|
useNetworkd = true;
|
||||||
@@ -13,7 +16,7 @@
|
|||||||
ips = [ "10.69.222.3/24" ];
|
ips = [ "10.69.222.3/24" ];
|
||||||
mtu = 1384;
|
mtu = 1384;
|
||||||
listenPort = 51820;
|
listenPort = 51820;
|
||||||
privateKeyFile = config.sops.secrets.wireguard_private_key.path;
|
privateKeyFile = "/run/secrets/wireguard_private_key";
|
||||||
peers = [
|
peers = [
|
||||||
{
|
{
|
||||||
name = "docker2.t-juice.club";
|
name = "docker2.t-juice.club";
|
||||||
@@ -26,7 +29,11 @@
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
# monitoring
|
homelab.monitoring.scrapeTargets = [{
|
||||||
|
job_name = "wireguard";
|
||||||
|
port = 9586;
|
||||||
|
}];
|
||||||
|
|
||||||
services.prometheus.exporters.wireguard = {
|
services.prometheus.exporters.wireguard = {
|
||||||
enable = true;
|
enable = true;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
@@ -61,9 +61,8 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
zramSwap = {
|
vault.enable = true;
|
||||||
enable = true;
|
homelab.deploy.enable = true;
|
||||||
};
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "23.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
|
|||||||
42
hosts/jelly01/hardware-configuration.nix
Normal file
42
hosts/jelly01/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
../../system
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
|
|
||||||
networking.hostName = "jump";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = false;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.10/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -1,27 +1,39 @@
|
|||||||
{
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
pkgs,
|
pkgs,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
|
../../services/kanidm
|
||||||
];
|
];
|
||||||
|
|
||||||
homelab.dns.cnames = [ "ldap" ];
|
# Host metadata
|
||||||
|
homelab.host = {
|
||||||
nixpkgs.config.allowUnfree = true;
|
tier = "test";
|
||||||
# Use the systemd-boot EFI boot loader.
|
role = "auth";
|
||||||
boot.loader.grub = {
|
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.hostName = "auth01";
|
# DNS CNAME for auth.home.2rjus.net
|
||||||
|
homelab.dns.cnames = [ "auth" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "kanidm01";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
@@ -35,7 +47,7 @@
|
|||||||
systemd.network.networks."ens18" = {
|
systemd.network.networks."ens18" = {
|
||||||
matchConfig.Name = "ens18";
|
matchConfig.Name = "ens18";
|
||||||
address = [
|
address = [
|
||||||
"10.69.13.18/24"
|
"10.69.13.23/24"
|
||||||
];
|
];
|
||||||
routes = [
|
routes = [
|
||||||
{ Gateway = "10.69.13.1"; }
|
{ Gateway = "10.69.13.1"; }
|
||||||
@@ -55,13 +67,11 @@
|
|||||||
git
|
git
|
||||||
];
|
];
|
||||||
|
|
||||||
services.qemuGuest.enable = true;
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,76 +0,0 @@
|
|||||||
{
|
|
||||||
pkgs,
|
|
||||||
...
|
|
||||||
}:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot = {
|
|
||||||
loader.systemd-boot = {
|
|
||||||
enable = true;
|
|
||||||
configurationLimit = 5;
|
|
||||||
memtest86.enable = true;
|
|
||||||
};
|
|
||||||
loader.efi.canTouchEfiVariables = true;
|
|
||||||
supportedFilesystems = [ "nfs" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hostName = "media1";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."enp2s0" = {
|
|
||||||
matchConfig.Name = "enp2s0";
|
|
||||||
address = [
|
|
||||||
"10.69.12.82/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.12.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
# Graphics
|
|
||||||
hardware.graphics = {
|
|
||||||
enable = true;
|
|
||||||
extraPackages = with pkgs; [
|
|
||||||
libvdpau-va-gl
|
|
||||||
libva-vdpau-driver
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
./kodi.nix
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/installer/scan/not-detected.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "usb_storage" "usbhid" "sd_mod" "rtsx_usb_sdmmc" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
boot.kernelModules = [ "kvm-amd" ];
|
|
||||||
boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/3e7c311c-b1a3-4be7-b8bf-e497cba64302";
|
|
||||||
fsType = "btrfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/F0D7-E5C1";
|
|
||||||
fsType = "vfat";
|
|
||||||
options = [ "fmask=0022" "dmask=0022" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/1a06a36f-da61-4d36-b94e-b852836c328a"; }];
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
let
|
|
||||||
kodipkg = pkgs.kodi-wayland.withPackages (
|
|
||||||
p: with p; [
|
|
||||||
jellyfin
|
|
||||||
]
|
|
||||||
);
|
|
||||||
in
|
|
||||||
{
|
|
||||||
users.users.kodi = {
|
|
||||||
isNormalUser = true;
|
|
||||||
description = "Kodi Media Center user";
|
|
||||||
};
|
|
||||||
#services.xserver = {
|
|
||||||
# enable = true;
|
|
||||||
#};
|
|
||||||
services.cage = {
|
|
||||||
enable = true;
|
|
||||||
user = "kodi";
|
|
||||||
environment = {
|
|
||||||
XKB_DEFAULT_LAYOUT = "no";
|
|
||||||
};
|
|
||||||
program = "${kodipkg}/bin/kodi";
|
|
||||||
};
|
|
||||||
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
firefox
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
@@ -56,7 +56,16 @@
|
|||||||
|
|
||||||
services.qemuGuest.enable = true;
|
services.qemuGuest.enable = true;
|
||||||
|
|
||||||
sops.secrets."backup_helper_secret" = { };
|
# Vault secrets management
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
vault.secrets.backup-helper = {
|
||||||
|
secretPath = "shared/backup/password";
|
||||||
|
extractKey = "password";
|
||||||
|
outputDir = "/run/secrets/backup_helper_secret";
|
||||||
|
services = [ "restic-backups-grafana" "restic-backups-grafana-db" ];
|
||||||
|
};
|
||||||
|
|
||||||
services.restic.backups.grafana = {
|
services.restic.backups.grafana = {
|
||||||
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
||||||
passwordFile = "/run/secrets/backup_helper_secret";
|
passwordFile = "/run/secrets/backup_helper_secret";
|
||||||
@@ -64,6 +73,7 @@
|
|||||||
timerConfig = {
|
timerConfig = {
|
||||||
OnCalendar = "daily";
|
OnCalendar = "daily";
|
||||||
Persistent = true;
|
Persistent = true;
|
||||||
|
RandomizedDelaySec = "2h";
|
||||||
};
|
};
|
||||||
pruneOpts = [
|
pruneOpts = [
|
||||||
"--keep-daily 7"
|
"--keep-daily 7"
|
||||||
@@ -80,6 +90,7 @@
|
|||||||
timerConfig = {
|
timerConfig = {
|
||||||
OnCalendar = "daily";
|
OnCalendar = "daily";
|
||||||
Persistent = true;
|
Persistent = true;
|
||||||
|
RandomizedDelaySec = "2h";
|
||||||
};
|
};
|
||||||
pruneOpts = [
|
pruneOpts = [
|
||||||
"--keep-daily 7"
|
"--keep-daily 7"
|
||||||
@@ -89,61 +100,6 @@
|
|||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
labmon = {
|
|
||||||
enable = true;
|
|
||||||
|
|
||||||
settings = {
|
|
||||||
ListenAddr = ":9969";
|
|
||||||
Profiling = true;
|
|
||||||
StepMonitors = [
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
BaseURL = "https://ca.home.2rjus.net";
|
|
||||||
RootID = "3381bda8015a86b9a3cd1851439d1091890a79005e0f1f7c4301fe4bccc29d80";
|
|
||||||
}
|
|
||||||
];
|
|
||||||
|
|
||||||
TLSConnectionMonitors = [
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "ca.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "jelly.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "grafana.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "prometheus.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "alertmanager.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "pyroscope.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
|
|||||||
42
hosts/monitoring01/hardware-configuration.nix
Normal file
42
hosts/monitoring01/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
@@ -59,5 +59,8 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "23.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
|
|||||||
42
hosts/nats1/hardware-configuration.nix
Normal file
42
hosts/nats1/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
@@ -13,6 +13,8 @@
|
|||||||
|
|
||||||
homelab.dns.cnames = [ "nix-cache" "actions1" ];
|
homelab.dns.cnames = [ "nix-cache" "actions1" ];
|
||||||
|
|
||||||
|
homelab.host.role = "build-host";
|
||||||
|
|
||||||
fileSystems."/nix" = {
|
fileSystems."/nix" = {
|
||||||
device = "/dev/disk/by-label/nixcache";
|
device = "/dev/disk/by-label/nixcache";
|
||||||
fsType = "xfs";
|
fsType = "xfs";
|
||||||
@@ -52,6 +54,9 @@
|
|||||||
"nix-command"
|
"nix-command"
|
||||||
"flakes"
|
"flakes"
|
||||||
];
|
];
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
|
|||||||
@@ -4,6 +4,5 @@
|
|||||||
./configuration.nix
|
./configuration.nix
|
||||||
../../services/nix-cache
|
../../services/nix-cache
|
||||||
../../services/actions-runner
|
../../services/actions-runner
|
||||||
./zram.nix
|
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
42
hosts/nix-cache01/hardware-configuration.nix
Normal file
42
hosts/nix-cache01/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
zramSwap = {
|
|
||||||
enable = true;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
|
|
||||||
networking.hostName = "nixos-test1";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.10/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
# Secrets
|
|
||||||
# Backup
|
|
||||||
sops.secrets."backup_helper_secret" = { };
|
|
||||||
services.restic.backups.test = {
|
|
||||||
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
|
||||||
passwordFile = "/run/secrets/backup_helper_secret";
|
|
||||||
paths = [
|
|
||||||
"/etc/machine-id"
|
|
||||||
"/etc/os-release"
|
|
||||||
];
|
|
||||||
timerConfig = {
|
|
||||||
OnCalendar = "daily";
|
|
||||||
Persistent = true;
|
|
||||||
};
|
|
||||||
pruneOpts = [
|
|
||||||
"--keep-daily 7"
|
|
||||||
"--keep-weekly 4"
|
|
||||||
"--keep-monthly 6"
|
|
||||||
"--keep-within 1d"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{ ... }: {
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -7,23 +7,38 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
|
||||||
|
# DNS services
|
||||||
../../services/ns/master-authorative.nix
|
../../services/ns/master-authorative.nix
|
||||||
../../services/ns/resolver.nix
|
../../services/ns/resolver.nix
|
||||||
../../common/vm
|
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Host metadata
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/sda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
networking.hostName = "ns1";
|
networking.hostName = "ns1";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
|
# Disable resolved - conflicts with Unbound resolver
|
||||||
services.resolved.enable = false;
|
services.resolved.enable = false;
|
||||||
networking.nameservers = [
|
networking.nameservers = [
|
||||||
"10.69.13.5"
|
"10.69.13.5"
|
||||||
@@ -60,5 +75,5 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -7,23 +7,38 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
|
||||||
|
# DNS services
|
||||||
../../services/ns/secondary-authorative.nix
|
../../services/ns/secondary-authorative.nix
|
||||||
../../services/ns/resolver.nix
|
../../services/ns/resolver.nix
|
||||||
../../common/vm
|
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Host metadata
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "secondary";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/sda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
networking.hostName = "ns2";
|
networking.hostName = "ns2";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
|
# Disable resolved - conflicts with Unbound resolver
|
||||||
services.resolved.enable = false;
|
services.resolved.enable = false;
|
||||||
networking.nameservers = [
|
networking.nameservers = [
|
||||||
"10.69.13.5"
|
"10.69.13.5"
|
||||||
@@ -47,6 +62,7 @@
|
|||||||
"nix-command"
|
"nix-command"
|
||||||
"flakes"
|
"flakes"
|
||||||
];
|
];
|
||||||
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
wget
|
wget
|
||||||
@@ -59,5 +75,5 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
../../services/ns/master-authorative.nix
|
|
||||||
../../services/ns/resolver.nix
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
|
|
||||||
networking.hostName = "ns3";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = false;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.7/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
../../services/ns/secondary-authorative.nix
|
|
||||||
../../services/ns/resolver.nix
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
|
|
||||||
networking.hostName = "ns4";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = false;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.8/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
{ ... }: {
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
../../services/postgres
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
./hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
];
|
|
||||||
|
|
||||||
# Template host - exclude from DNS zone generation
|
|
||||||
homelab.dns.enable = false;
|
|
||||||
|
|
||||||
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
networking.hostName = "nixos-template";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.8.250/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.8.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
age
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
{ ... }: {
|
|
||||||
imports = [
|
|
||||||
./hardware-configuration.nix
|
|
||||||
./configuration.nix
|
|
||||||
./scripts.nix
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
let
|
|
||||||
prepare-host-script = pkgs.writeShellScriptBin "prepare-host.sh"
|
|
||||||
''
|
|
||||||
echo "Removing machine-id"
|
|
||||||
rm -f /etc/machine-id || true
|
|
||||||
|
|
||||||
echo "Removing SSH host keys"
|
|
||||||
rm -f /etc/ssh/ssh_host_* || true
|
|
||||||
|
|
||||||
echo "Restarting SSH"
|
|
||||||
systemctl restart sshd
|
|
||||||
|
|
||||||
echo "Removing temporary files"
|
|
||||||
rm -rf /tmp/* || true
|
|
||||||
|
|
||||||
echo "Removing logs"
|
|
||||||
journalctl --rotate || true
|
|
||||||
journalctl --vacuum-time=1s || true
|
|
||||||
|
|
||||||
echo "Removing cache"
|
|
||||||
rm -rf /var/cache/* || true
|
|
||||||
|
|
||||||
echo "Generate age key"
|
|
||||||
rm -rf /var/lib/sops-nix || true
|
|
||||||
mkdir -p /var/lib/sops-nix
|
|
||||||
${pkgs.age}/bin/age-keygen -o /var/lib/sops-nix/key.txt
|
|
||||||
'';
|
|
||||||
in
|
|
||||||
{
|
|
||||||
environment.systemPackages = [ prepare-host-script ];
|
|
||||||
users.motd = "Prepare host by running 'prepare-host.sh'.";
|
|
||||||
}
|
|
||||||
@@ -6,22 +6,72 @@ let
|
|||||||
text = ''
|
text = ''
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
LOKI_URL="http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"
|
||||||
|
|
||||||
|
# Send a log entry to Loki with bootstrap status
|
||||||
|
# Usage: log_to_loki <stage> <message>
|
||||||
|
# Fails silently if Loki is unreachable
|
||||||
|
log_to_loki() {
|
||||||
|
local stage="$1"
|
||||||
|
local message="$2"
|
||||||
|
local timestamp_ns
|
||||||
|
timestamp_ns="$(date +%s)000000000"
|
||||||
|
|
||||||
|
local payload
|
||||||
|
payload=$(jq -n \
|
||||||
|
--arg host "$HOSTNAME" \
|
||||||
|
--arg stage "$stage" \
|
||||||
|
--arg branch "''${BRANCH:-master}" \
|
||||||
|
--arg ts "$timestamp_ns" \
|
||||||
|
--arg msg "$message" \
|
||||||
|
'{
|
||||||
|
streams: [{
|
||||||
|
stream: {
|
||||||
|
job: "bootstrap",
|
||||||
|
host: $host,
|
||||||
|
stage: $stage,
|
||||||
|
branch: $branch
|
||||||
|
},
|
||||||
|
values: [[$ts, $msg]]
|
||||||
|
}]
|
||||||
|
}')
|
||||||
|
|
||||||
|
curl -s --connect-timeout 2 --max-time 5 \
|
||||||
|
-X POST \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d "$payload" \
|
||||||
|
"$LOKI_URL" >/dev/null 2>&1 || true
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "================================================================================"
|
||||||
|
echo " NIXOS BOOTSTRAP IN PROGRESS"
|
||||||
|
echo "================================================================================"
|
||||||
|
echo ""
|
||||||
|
|
||||||
# Read hostname set by cloud-init (from Terraform VM name via user-data)
|
# Read hostname set by cloud-init (from Terraform VM name via user-data)
|
||||||
# Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl
|
# Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl
|
||||||
HOSTNAME=$(hostnamectl hostname)
|
HOSTNAME=$(hostnamectl hostname)
|
||||||
echo "DEBUG: Hostname from hostnamectl: '$HOSTNAME'"
|
# Read git branch from environment, default to master
|
||||||
|
BRANCH="''${NIXOS_FLAKE_BRANCH:-master}"
|
||||||
|
|
||||||
|
echo "Hostname: $HOSTNAME"
|
||||||
|
echo ""
|
||||||
echo "Starting NixOS bootstrap for host: $HOSTNAME"
|
echo "Starting NixOS bootstrap for host: $HOSTNAME"
|
||||||
|
|
||||||
|
log_to_loki "starting" "Bootstrap starting for $HOSTNAME (branch: $BRANCH)"
|
||||||
|
|
||||||
echo "Waiting for network connectivity..."
|
echo "Waiting for network connectivity..."
|
||||||
|
|
||||||
# Verify we can reach the git server via HTTPS (doesn't respond to ping)
|
# Verify we can reach the git server via HTTPS (doesn't respond to ping)
|
||||||
if ! curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; then
|
if ! curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; then
|
||||||
echo "ERROR: Cannot reach git.t-juice.club via HTTPS"
|
echo "ERROR: Cannot reach git.t-juice.club via HTTPS"
|
||||||
echo "Check network configuration and DNS settings"
|
echo "Check network configuration and DNS settings"
|
||||||
|
log_to_loki "failed" "Network check failed - cannot reach git.t-juice.club"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Network connectivity confirmed"
|
echo "Network connectivity confirmed"
|
||||||
|
log_to_loki "network_ok" "Network connectivity confirmed"
|
||||||
|
|
||||||
# Unwrap Vault token and store AppRole credentials (if provided)
|
# Unwrap Vault token and store AppRole credentials (if provided)
|
||||||
if [ -n "''${VAULT_WRAPPED_TOKEN:-}" ]; then
|
if [ -n "''${VAULT_WRAPPED_TOKEN:-}" ]; then
|
||||||
@@ -50,6 +100,7 @@ let
|
|||||||
chmod 600 /var/lib/vault/approle/secret-id
|
chmod 600 /var/lib/vault/approle/secret-id
|
||||||
|
|
||||||
echo "Vault credentials unwrapped and stored successfully"
|
echo "Vault credentials unwrapped and stored successfully"
|
||||||
|
log_to_loki "vault_ok" "Vault credentials unwrapped and stored"
|
||||||
else
|
else
|
||||||
echo "WARNING: Failed to unwrap Vault token"
|
echo "WARNING: Failed to unwrap Vault token"
|
||||||
if [ -n "$UNWRAP_RESPONSE" ]; then
|
if [ -n "$UNWRAP_RESPONSE" ]; then
|
||||||
@@ -63,17 +114,17 @@ let
|
|||||||
echo "To regenerate token, run: create-host --hostname $HOSTNAME --force"
|
echo "To regenerate token, run: create-host --hostname $HOSTNAME --force"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Vault secrets will not be available, but continuing bootstrap..."
|
echo "Vault secrets will not be available, but continuing bootstrap..."
|
||||||
|
log_to_loki "vault_warn" "Failed to unwrap Vault token - continuing without secrets"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "No Vault wrapped token provided (VAULT_WRAPPED_TOKEN not set)"
|
echo "No Vault wrapped token provided (VAULT_WRAPPED_TOKEN not set)"
|
||||||
echo "Skipping Vault credential setup"
|
echo "Skipping Vault credential setup"
|
||||||
|
log_to_loki "vault_skip" "No Vault token provided - skipping credential setup"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Fetching and building NixOS configuration from flake..."
|
echo "Fetching and building NixOS configuration from flake..."
|
||||||
|
|
||||||
# Read git branch from environment, default to master
|
|
||||||
BRANCH="''${NIXOS_FLAKE_BRANCH:-master}"
|
|
||||||
echo "Using git branch: $BRANCH"
|
echo "Using git branch: $BRANCH"
|
||||||
|
log_to_loki "building" "Starting nixos-rebuild boot"
|
||||||
|
|
||||||
# Build and activate the host-specific configuration
|
# Build and activate the host-specific configuration
|
||||||
FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git?ref=$BRANCH#''${HOSTNAME}"
|
FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git?ref=$BRANCH#''${HOSTNAME}"
|
||||||
@@ -81,18 +132,30 @@ let
|
|||||||
if nixos-rebuild boot --flake "$FLAKE_URL"; then
|
if nixos-rebuild boot --flake "$FLAKE_URL"; then
|
||||||
echo "Successfully built configuration for $HOSTNAME"
|
echo "Successfully built configuration for $HOSTNAME"
|
||||||
echo "Rebooting into new configuration..."
|
echo "Rebooting into new configuration..."
|
||||||
|
log_to_loki "success" "Build successful - rebooting into new configuration"
|
||||||
sleep 2
|
sleep 2
|
||||||
systemctl reboot
|
systemctl reboot
|
||||||
else
|
else
|
||||||
echo "ERROR: nixos-rebuild failed for $HOSTNAME"
|
echo "ERROR: nixos-rebuild failed for $HOSTNAME"
|
||||||
echo "Check that flake has configuration for this hostname"
|
echo "Check that flake has configuration for this hostname"
|
||||||
echo "Manual intervention required - system will not reboot"
|
echo "Manual intervention required - system will not reboot"
|
||||||
|
log_to_loki "failed" "nixos-rebuild failed - manual intervention required"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
|
# Custom greeting line to indicate this is a bootstrap image
|
||||||
|
services.getty.greetingLine = lib.mkForce ''
|
||||||
|
================================================================================
|
||||||
|
BOOTSTRAP IMAGE - NixOS \V (\l)
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Bootstrap service is running. Logs are displayed on tty1.
|
||||||
|
Check status: journalctl -fu nixos-bootstrap
|
||||||
|
'';
|
||||||
|
|
||||||
systemd.services."nixos-bootstrap" = {
|
systemd.services."nixos-bootstrap" = {
|
||||||
description = "Bootstrap NixOS configuration from flake on first boot";
|
description = "Bootstrap NixOS configuration from flake on first boot";
|
||||||
|
|
||||||
@@ -107,12 +170,12 @@ in
|
|||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
Type = "oneshot";
|
Type = "oneshot";
|
||||||
RemainAfterExit = true;
|
RemainAfterExit = true;
|
||||||
ExecStart = "${bootstrap-script}/bin/nixos-bootstrap";
|
ExecStart = lib.getExe bootstrap-script;
|
||||||
|
|
||||||
# Read environment variables from cloud-init (set by cloud-init write_files)
|
# Read environment variables from cloud-init (set by cloud-init write_files)
|
||||||
EnvironmentFile = "-/run/cloud-init-env";
|
EnvironmentFile = "-/run/cloud-init-env";
|
||||||
|
|
||||||
# Logging to journald
|
# Log to journal and console
|
||||||
StandardOutput = "journal+console";
|
StandardOutput = "journal+console";
|
||||||
StandardError = "journal+console";
|
StandardError = "journal+console";
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -32,6 +32,11 @@
|
|||||||
datasource_list = [ "ConfigDrive" "NoCloud" ];
|
datasource_list = [ "ConfigDrive" "NoCloud" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
homelab.host = {
|
||||||
|
tier = "test";
|
||||||
|
priority = "low";
|
||||||
|
};
|
||||||
|
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/vda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
networking.hostName = "nixos-template2";
|
networking.hostName = "nixos-template2";
|
||||||
@@ -53,6 +58,14 @@
|
|||||||
"flakes"
|
"flakes"
|
||||||
];
|
];
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
|
nix.settings.substituters = [
|
||||||
|
"https://nix-cache.home.2rjus.net"
|
||||||
|
"https://cache.nixos.org"
|
||||||
|
];
|
||||||
|
nix.settings.trusted-public-keys = [
|
||||||
|
"nix-cache.home.2rjus.net-1:2kowZOG6pvhoK4AHVO3alBlvcghH20wchzoR0V86UWI="
|
||||||
|
"cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="
|
||||||
|
];
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
age
|
age
|
||||||
vim
|
vim
|
||||||
@@ -66,5 +79,8 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
# Compressed swap in RAM - prevents OOM during bootstrap nixos-rebuild
|
||||||
|
zramSwap.enable = true;
|
||||||
|
|
||||||
system.stateVersion = "25.11";
|
system.stateVersion = "25.11";
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
{ pkgs, ... }:
|
{ pkgs, ... }:
|
||||||
let
|
let
|
||||||
prepare-host-script = pkgs.writeShellScriptBin "prepare-host.sh"
|
prepare-host-script = pkgs.writeShellApplication {
|
||||||
''
|
name = "prepare-host.sh";
|
||||||
|
text = ''
|
||||||
echo "Removing machine-id"
|
echo "Removing machine-id"
|
||||||
rm -f /etc/machine-id || true
|
rm -f /etc/machine-id || true
|
||||||
|
|
||||||
@@ -20,12 +21,8 @@ let
|
|||||||
|
|
||||||
echo "Removing cache"
|
echo "Removing cache"
|
||||||
rm -rf /var/cache/* || true
|
rm -rf /var/cache/* || true
|
||||||
|
|
||||||
echo "Generate age key"
|
|
||||||
rm -rf /var/lib/sops-nix || true
|
|
||||||
mkdir -p /var/lib/sops-nix
|
|
||||||
${pkgs.age}/bin/age-keygen -o /var/lib/sops-nix/key.txt
|
|
||||||
'';
|
'';
|
||||||
|
};
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
environment.systemPackages = [ prepare-host-script ];
|
environment.systemPackages = [ prepare-host-script ];
|
||||||
|
|||||||
@@ -11,10 +11,22 @@
|
|||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
|
../../common/ssh-audit.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
# Test VM - exclude from DNS zone generation
|
# Host metadata (adjust as needed)
|
||||||
homelab.dns.enable = false;
|
homelab.host = {
|
||||||
|
tier = "test"; # Start in test tier, move to prod after validation
|
||||||
|
};
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
# Enable Kanidm PAM/NSS for central authentication
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
@@ -24,7 +36,7 @@
|
|||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
services.resolved.enable = false;
|
services.resolved.enable = true;
|
||||||
networking.nameservers = [
|
networking.nameservers = [
|
||||||
"10.69.13.5"
|
"10.69.13.5"
|
||||||
"10.69.13.6"
|
"10.69.13.6"
|
||||||
@@ -34,7 +46,7 @@
|
|||||||
systemd.network.networks."ens18" = {
|
systemd.network.networks."ens18" = {
|
||||||
matchConfig.Name = "ens18";
|
matchConfig.Name = "ens18";
|
||||||
address = [
|
address = [
|
||||||
"10.69.13.101/24"
|
"10.69.13.20/24"
|
||||||
];
|
];
|
||||||
routes = [
|
routes = [
|
||||||
{ Gateway = "10.69.13.1"; }
|
{ Gateway = "10.69.13.1"; }
|
||||||
@@ -54,6 +66,39 @@
|
|||||||
git
|
git
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Test nginx with ACME certificate from OpenBao PKI
|
||||||
|
services.nginx = {
|
||||||
|
enable = true;
|
||||||
|
virtualHosts."testvm01.home.2rjus.net" = {
|
||||||
|
forceSSL = true;
|
||||||
|
enableACME = true;
|
||||||
|
locations."/" = {
|
||||||
|
root = pkgs.writeTextDir "index.html" ''
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>testvm01 - ACME Test</title>
|
||||||
|
<style>
|
||||||
|
body { font-family: monospace; max-width: 600px; margin: 50px auto; padding: 20px; }
|
||||||
|
.joke { background: #f0f0f0; padding: 20px; border-radius: 8px; margin: 20px 0; }
|
||||||
|
.punchline { margin-top: 15px; font-weight: bold; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>OpenBao PKI ACME Test</h1>
|
||||||
|
<p>If you're seeing this over HTTPS, the migration worked!</p>
|
||||||
|
<div class="joke">
|
||||||
|
<p>Why do programmers prefer dark mode?</p>
|
||||||
|
<p class="punchline">Because light attracts bugs.</p>
|
||||||
|
</div>
|
||||||
|
<p><small>Certificate issued by: vault.home.2rjus.net</small></p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
|
|||||||
@@ -1,25 +1,38 @@
|
|||||||
{
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
pkgs,
|
pkgs,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
|
../../common/ssh-audit.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
# Host metadata (adjust as needed)
|
||||||
# Use the systemd-boot EFI boot loader.
|
homelab.host = {
|
||||||
boot.loader.grub = {
|
tier = "test"; # Start in test tier, move to prod after validation
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.hostName = "pgdb1";
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
# Enable Kanidm PAM/NSS for central authentication
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "testvm02";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
@@ -33,7 +46,7 @@
|
|||||||
systemd.network.networks."ens18" = {
|
systemd.network.networks."ens18" = {
|
||||||
matchConfig.Name = "ens18";
|
matchConfig.Name = "ens18";
|
||||||
address = [
|
address = [
|
||||||
"10.69.13.16/24"
|
"10.69.13.21/24"
|
||||||
];
|
];
|
||||||
routes = [
|
routes = [
|
||||||
{ Gateway = "10.69.13.1"; }
|
{ Gateway = "10.69.13.1"; }
|
||||||
@@ -59,5 +72,5 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,25 +1,38 @@
|
|||||||
{
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
pkgs,
|
pkgs,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
|
../../common/ssh-audit.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
# Host metadata (adjust as needed)
|
||||||
# Use the systemd-boot EFI boot loader.
|
homelab.host = {
|
||||||
boot.loader.grub = {
|
tier = "test"; # Start in test tier, move to prod after validation
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.hostName = "ca";
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
# Enable Kanidm PAM/NSS for central authentication
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "testvm03";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
@@ -33,7 +46,7 @@
|
|||||||
systemd.network.networks."ens18" = {
|
systemd.network.networks."ens18" = {
|
||||||
matchConfig.Name = "ens18";
|
matchConfig.Name = "ens18";
|
||||||
address = [
|
address = [
|
||||||
"10.69.13.12/24"
|
"10.69.13.22/24"
|
||||||
];
|
];
|
||||||
routes = [
|
routes = [
|
||||||
{ Gateway = "10.69.13.1"; }
|
{ Gateway = "10.69.13.1"; }
|
||||||
@@ -59,5 +72,5 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -16,6 +16,8 @@
|
|||||||
|
|
||||||
homelab.dns.cnames = [ "vault" ];
|
homelab.dns.cnames = [ "vault" ];
|
||||||
|
|
||||||
|
homelab.host.role = "vault";
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/vda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
@@ -60,6 +62,16 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
# Vault fetches secrets from itself (after unseal)
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
# Ensure vault-secret services wait for openbao to be unsealed
|
||||||
|
systemd.services.vault-secret-homelab-deploy-nkey = {
|
||||||
|
after = [ "openbao.service" ];
|
||||||
|
wants = [ "openbao.service" ];
|
||||||
|
};
|
||||||
|
|
||||||
system.stateVersion = "25.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,121 +0,0 @@
|
|||||||
{
|
|
||||||
config,
|
|
||||||
lib,
|
|
||||||
pkgs,
|
|
||||||
...
|
|
||||||
}:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
../template2/hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
../../common/vm
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/vda";
|
|
||||||
|
|
||||||
networking.hostName = "vaulttest01";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.150/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
# Testing config
|
|
||||||
# Enable Vault secrets management
|
|
||||||
vault.enable = true;
|
|
||||||
|
|
||||||
# Define a test secret
|
|
||||||
vault.secrets.test-service = {
|
|
||||||
secretPath = "hosts/vaulttest01/test-service";
|
|
||||||
restartTrigger = true;
|
|
||||||
restartInterval = "daily";
|
|
||||||
services = [ "vault-test" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
# Create a test service that uses the secret
|
|
||||||
systemd.services.vault-test = {
|
|
||||||
description = "Test Vault secret fetching";
|
|
||||||
wantedBy = [ "multi-user.target" ];
|
|
||||||
after = [ "vault-secret-test-service.service" ];
|
|
||||||
|
|
||||||
serviceConfig = {
|
|
||||||
Type = "oneshot";
|
|
||||||
RemainAfterExit = true;
|
|
||||||
|
|
||||||
ExecStart = pkgs.writeShellScript "vault-test" ''
|
|
||||||
echo "=== Vault Secret Test ==="
|
|
||||||
echo "Secret path: hosts/vaulttest01/test-service"
|
|
||||||
|
|
||||||
if [ -f /run/secrets/test-service/password ]; then
|
|
||||||
echo "✓ Password file exists"
|
|
||||||
echo "Password length: $(wc -c < /run/secrets/test-service/password)"
|
|
||||||
else
|
|
||||||
echo "✗ Password file missing!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -d /var/lib/vault/cache/test-service ]; then
|
|
||||||
echo "✓ Cache directory exists"
|
|
||||||
else
|
|
||||||
echo "✗ Cache directory missing!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Test successful!"
|
|
||||||
'';
|
|
||||||
|
|
||||||
StandardOutput = "journal+console";
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
# Test ACME certificate issuance from OpenBao PKI
|
|
||||||
# Override the global ACME server (from system/acme.nix) to use OpenBao instead of step-ca
|
|
||||||
security.acme.defaults.server = lib.mkForce "https://vault01.home.2rjus.net:8200/v1/pki_int/acme/directory";
|
|
||||||
|
|
||||||
# Request a certificate for this host
|
|
||||||
# Using HTTP-01 challenge with standalone listener on port 80
|
|
||||||
security.acme.certs."vaulttest01.home.2rjus.net" = {
|
|
||||||
listenHTTP = ":80";
|
|
||||||
enableDebugLogs = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
system.stateVersion = "25.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -6,10 +6,6 @@ import subprocess
|
|||||||
IGNORED_HOSTS = [
|
IGNORED_HOSTS = [
|
||||||
"inc1",
|
"inc1",
|
||||||
"inc2",
|
"inc2",
|
||||||
"media1",
|
|
||||||
"nixos-test1",
|
|
||||||
"ns3",
|
|
||||||
"ns4",
|
|
||||||
"template1",
|
"template1",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ let
|
|||||||
, retry ? 900
|
, retry ? 900
|
||||||
, expire ? 1209600
|
, expire ? 1209600
|
||||||
, minTtl ? 120
|
, minTtl ? 120
|
||||||
, nameservers ? [ "ns1" "ns2" "ns3" ]
|
, nameservers ? [ "ns1" "ns2" ]
|
||||||
, adminEmail ? "admin.test.2rjus.net"
|
, adminEmail ? "admin.test.2rjus.net"
|
||||||
}:
|
}:
|
||||||
let
|
let
|
||||||
|
|||||||
210
lib/monitoring.nix
Normal file
210
lib/monitoring.nix
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
{ lib }:
|
||||||
|
let
|
||||||
|
# Extract IP address from CIDR notation (e.g., "10.69.13.5/24" -> "10.69.13.5")
|
||||||
|
extractIP = address:
|
||||||
|
let
|
||||||
|
parts = lib.splitString "/" address;
|
||||||
|
in
|
||||||
|
builtins.head parts;
|
||||||
|
|
||||||
|
# Check if a network interface name looks like a VPN/tunnel interface
|
||||||
|
isVpnInterface = ifaceName:
|
||||||
|
lib.hasPrefix "wg" ifaceName ||
|
||||||
|
lib.hasPrefix "tun" ifaceName ||
|
||||||
|
lib.hasPrefix "tap" ifaceName ||
|
||||||
|
lib.hasPrefix "vti" ifaceName;
|
||||||
|
|
||||||
|
# Extract monitoring info from a single host configuration
|
||||||
|
# Returns null if host should not be included
|
||||||
|
extractHostMonitoring = name: hostConfig:
|
||||||
|
let
|
||||||
|
cfg = hostConfig.config;
|
||||||
|
monConfig = (cfg.homelab or { }).monitoring or { enable = true; scrapeTargets = [ ]; };
|
||||||
|
dnsConfig = (cfg.homelab or { }).dns or { enable = true; };
|
||||||
|
hostConfig' = (cfg.homelab or { }).host or { };
|
||||||
|
hostname = cfg.networking.hostName;
|
||||||
|
networks = cfg.systemd.network.networks or { };
|
||||||
|
|
||||||
|
# Filter out VPN interfaces and find networks with static addresses
|
||||||
|
physicalNetworks = lib.filterAttrs
|
||||||
|
(netName: netCfg:
|
||||||
|
let
|
||||||
|
ifaceName = netCfg.matchConfig.Name or "";
|
||||||
|
in
|
||||||
|
!(isVpnInterface ifaceName) && (netCfg.address or [ ]) != [ ])
|
||||||
|
networks;
|
||||||
|
|
||||||
|
# Get addresses from physical networks only
|
||||||
|
networkAddresses = lib.flatten (
|
||||||
|
lib.mapAttrsToList
|
||||||
|
(netName: netCfg: netCfg.address or [ ])
|
||||||
|
physicalNetworks
|
||||||
|
);
|
||||||
|
|
||||||
|
firstAddress = if networkAddresses != [ ] then builtins.head networkAddresses else null;
|
||||||
|
in
|
||||||
|
if !(monConfig.enable or true) || !(dnsConfig.enable or true) || firstAddress == null then
|
||||||
|
null
|
||||||
|
else
|
||||||
|
{
|
||||||
|
inherit hostname;
|
||||||
|
ip = extractIP firstAddress;
|
||||||
|
scrapeTargets = monConfig.scrapeTargets or [ ];
|
||||||
|
# Host metadata for label propagation
|
||||||
|
tier = hostConfig'.tier or "prod";
|
||||||
|
priority = hostConfig'.priority or "high";
|
||||||
|
role = hostConfig'.role or null;
|
||||||
|
labels = hostConfig'.labels or { };
|
||||||
|
};
|
||||||
|
|
||||||
|
# Build effective labels for a host
|
||||||
|
# Always includes hostname; only includes tier/priority/role if non-default
|
||||||
|
buildEffectiveLabels = host:
|
||||||
|
{ hostname = host.hostname; }
|
||||||
|
// (lib.optionalAttrs (host.tier != "prod") { tier = host.tier; })
|
||||||
|
// (lib.optionalAttrs (host.priority != "high") { priority = host.priority; })
|
||||||
|
// (lib.optionalAttrs (host.role != null) { role = host.role; })
|
||||||
|
// host.labels;
|
||||||
|
|
||||||
|
# Generate node-exporter targets from all flake hosts
|
||||||
|
# Returns a list of static_configs entries with labels
|
||||||
|
generateNodeExporterTargets = self: externalTargets:
|
||||||
|
let
|
||||||
|
nixosConfigs = self.nixosConfigurations or { };
|
||||||
|
hostList = lib.filter (x: x != null) (
|
||||||
|
lib.mapAttrsToList extractHostMonitoring nixosConfigs
|
||||||
|
);
|
||||||
|
|
||||||
|
# Extract hostname from a target string like "gunter.home.2rjus.net:9100"
|
||||||
|
extractHostnameFromTarget = target:
|
||||||
|
builtins.head (lib.splitString "." target);
|
||||||
|
|
||||||
|
# Build target entries with labels for each host
|
||||||
|
flakeEntries = map
|
||||||
|
(host: {
|
||||||
|
target = "${host.hostname}.home.2rjus.net:9100";
|
||||||
|
labels = buildEffectiveLabels host;
|
||||||
|
})
|
||||||
|
hostList;
|
||||||
|
|
||||||
|
# External targets get hostname extracted from the target string
|
||||||
|
externalEntries = map
|
||||||
|
(target: {
|
||||||
|
inherit target;
|
||||||
|
labels = { hostname = extractHostnameFromTarget target; };
|
||||||
|
})
|
||||||
|
(externalTargets.nodeExporter or [ ]);
|
||||||
|
|
||||||
|
allEntries = flakeEntries ++ externalEntries;
|
||||||
|
|
||||||
|
# Group entries by their label set for efficient static_configs
|
||||||
|
# Convert labels attrset to a string key for grouping
|
||||||
|
labelKey = entry: builtins.toJSON entry.labels;
|
||||||
|
grouped = lib.groupBy labelKey allEntries;
|
||||||
|
|
||||||
|
# Convert groups to static_configs format
|
||||||
|
# Every flake host now has at least a hostname label
|
||||||
|
staticConfigs = lib.mapAttrsToList
|
||||||
|
(key: entries:
|
||||||
|
let
|
||||||
|
labels = (builtins.head entries).labels;
|
||||||
|
in
|
||||||
|
{ targets = map (e: e.target) entries; labels = labels; }
|
||||||
|
)
|
||||||
|
grouped;
|
||||||
|
in
|
||||||
|
staticConfigs;
|
||||||
|
|
||||||
|
# Generate scrape configs from all flake hosts and external targets
|
||||||
|
# Host labels are propagated to service targets for semantic alert filtering
|
||||||
|
generateScrapeConfigs = self: externalTargets:
|
||||||
|
let
|
||||||
|
nixosConfigs = self.nixosConfigurations or { };
|
||||||
|
hostList = lib.filter (x: x != null) (
|
||||||
|
lib.mapAttrsToList extractHostMonitoring nixosConfigs
|
||||||
|
);
|
||||||
|
|
||||||
|
# Collect all scrapeTargets from all hosts, including host labels
|
||||||
|
allTargets = lib.flatten (map
|
||||||
|
(host:
|
||||||
|
map
|
||||||
|
(target: {
|
||||||
|
inherit (target) job_name port metrics_path scheme scrape_interval honor_labels;
|
||||||
|
hostname = host.hostname;
|
||||||
|
hostLabels = buildEffectiveLabels host;
|
||||||
|
})
|
||||||
|
host.scrapeTargets
|
||||||
|
)
|
||||||
|
hostList
|
||||||
|
);
|
||||||
|
|
||||||
|
# Group targets by job_name
|
||||||
|
grouped = lib.groupBy (t: t.job_name) allTargets;
|
||||||
|
|
||||||
|
# Generate a scrape config for each job
|
||||||
|
# Within each job, group targets by their host labels for efficient static_configs
|
||||||
|
flakeScrapeConfigs = lib.mapAttrsToList
|
||||||
|
(jobName: targets:
|
||||||
|
let
|
||||||
|
first = builtins.head targets;
|
||||||
|
|
||||||
|
# Group targets within this job by their host labels
|
||||||
|
labelKey = t: builtins.toJSON t.hostLabels;
|
||||||
|
groupedByLabels = lib.groupBy labelKey targets;
|
||||||
|
|
||||||
|
# Every flake host now has at least a hostname label
|
||||||
|
staticConfigs = lib.mapAttrsToList
|
||||||
|
(key: labelTargets:
|
||||||
|
let
|
||||||
|
labels = (builtins.head labelTargets).hostLabels;
|
||||||
|
targetAddrs = map
|
||||||
|
(t: "${t.hostname}.home.2rjus.net:${toString t.port}")
|
||||||
|
labelTargets;
|
||||||
|
in
|
||||||
|
{ targets = targetAddrs; labels = labels; }
|
||||||
|
)
|
||||||
|
groupedByLabels;
|
||||||
|
|
||||||
|
config = {
|
||||||
|
job_name = jobName;
|
||||||
|
static_configs = staticConfigs;
|
||||||
|
}
|
||||||
|
// (lib.optionalAttrs (first.metrics_path != "/metrics") {
|
||||||
|
metrics_path = first.metrics_path;
|
||||||
|
})
|
||||||
|
// (lib.optionalAttrs (first.scheme != "http") {
|
||||||
|
scheme = first.scheme;
|
||||||
|
})
|
||||||
|
// (lib.optionalAttrs (first.scrape_interval != null) {
|
||||||
|
scrape_interval = first.scrape_interval;
|
||||||
|
})
|
||||||
|
// (lib.optionalAttrs first.honor_labels {
|
||||||
|
honor_labels = true;
|
||||||
|
});
|
||||||
|
in
|
||||||
|
config
|
||||||
|
)
|
||||||
|
grouped;
|
||||||
|
|
||||||
|
# External scrape configs
|
||||||
|
externalScrapeConfigs = map
|
||||||
|
(ext: {
|
||||||
|
job_name = ext.job_name;
|
||||||
|
static_configs = [{
|
||||||
|
targets = ext.targets;
|
||||||
|
}];
|
||||||
|
} // (lib.optionalAttrs (ext ? metrics_path) {
|
||||||
|
metrics_path = ext.metrics_path;
|
||||||
|
}) // (lib.optionalAttrs (ext ? scheme) {
|
||||||
|
scheme = ext.scheme;
|
||||||
|
}) // (lib.optionalAttrs (ext ? scrape_interval) {
|
||||||
|
scrape_interval = ext.scrape_interval;
|
||||||
|
}))
|
||||||
|
(externalTargets.scrapeConfigs or [ ]);
|
||||||
|
in
|
||||||
|
flakeScrapeConfigs ++ externalScrapeConfigs;
|
||||||
|
|
||||||
|
in
|
||||||
|
{
|
||||||
|
inherit extractHostMonitoring generateNodeExporterTargets generateScrapeConfigs;
|
||||||
|
}
|
||||||
@@ -1,6 +1,9 @@
|
|||||||
{ ... }:
|
{ ... }:
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
|
./deploy.nix
|
||||||
./dns.nix
|
./dns.nix
|
||||||
|
./host.nix
|
||||||
|
./monitoring.nix
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|||||||
16
modules/homelab/deploy.nix
Normal file
16
modules/homelab/deploy.nix
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{ config, lib, ... }:
|
||||||
|
|
||||||
|
{
|
||||||
|
options.homelab.deploy = {
|
||||||
|
enable = lib.mkEnableOption "homelab-deploy listener for NATS-based deployments";
|
||||||
|
};
|
||||||
|
|
||||||
|
config = {
|
||||||
|
assertions = [
|
||||||
|
{
|
||||||
|
assertion = config.homelab.deploy.enable -> config.vault.enable;
|
||||||
|
message = "homelab.deploy.enable requires vault.enable to be true (needed for NKey secret)";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
28
modules/homelab/host.nix
Normal file
28
modules/homelab/host.nix
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
{ lib, ... }:
|
||||||
|
{
|
||||||
|
options.homelab.host = {
|
||||||
|
tier = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "test" "prod" ];
|
||||||
|
default = "prod";
|
||||||
|
description = "Deployment tier - controls which credentials can deploy to this host";
|
||||||
|
};
|
||||||
|
|
||||||
|
priority = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "high" "low" ];
|
||||||
|
default = "high";
|
||||||
|
description = "Alerting priority - low priority hosts have relaxed thresholds";
|
||||||
|
};
|
||||||
|
|
||||||
|
role = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.str;
|
||||||
|
default = null;
|
||||||
|
description = "Primary role of this host (dns, database, monitoring, etc.)";
|
||||||
|
};
|
||||||
|
|
||||||
|
labels = lib.mkOption {
|
||||||
|
type = lib.types.attrsOf lib.types.str;
|
||||||
|
default = { };
|
||||||
|
description = "Additional free-form labels (e.g., dns_role = 'primary')";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
50
modules/homelab/monitoring.nix
Normal file
50
modules/homelab/monitoring.nix
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
{ config, lib, ... }:
|
||||||
|
let
|
||||||
|
cfg = config.homelab.monitoring;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
options.homelab.monitoring = {
|
||||||
|
enable = lib.mkOption {
|
||||||
|
type = lib.types.bool;
|
||||||
|
default = true;
|
||||||
|
description = "Include this host in Prometheus node-exporter scrape targets";
|
||||||
|
};
|
||||||
|
|
||||||
|
scrapeTargets = lib.mkOption {
|
||||||
|
type = lib.types.listOf (lib.types.submodule {
|
||||||
|
options = {
|
||||||
|
job_name = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
description = "Prometheus scrape job name";
|
||||||
|
};
|
||||||
|
port = lib.mkOption {
|
||||||
|
type = lib.types.port;
|
||||||
|
description = "Port to scrape metrics from";
|
||||||
|
};
|
||||||
|
metrics_path = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "/metrics";
|
||||||
|
description = "HTTP path to scrape metrics from";
|
||||||
|
};
|
||||||
|
scheme = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "http";
|
||||||
|
description = "HTTP scheme (http or https)";
|
||||||
|
};
|
||||||
|
scrape_interval = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.str;
|
||||||
|
default = null;
|
||||||
|
description = "Override the global scrape interval for this target";
|
||||||
|
};
|
||||||
|
honor_labels = lib.mkOption {
|
||||||
|
type = lib.types.bool;
|
||||||
|
default = false;
|
||||||
|
description = "Whether to honor labels from the scraped target";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = [ ];
|
||||||
|
description = "Additional Prometheus scrape targets exposed by this host";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -99,3 +99,48 @@
|
|||||||
- name: Display success message
|
- name: Display success message
|
||||||
ansible.builtin.debug:
|
ansible.builtin.debug:
|
||||||
msg: "Template VM {{ template_vmid }} created successfully on {{ storage }}"
|
msg: "Template VM {{ template_vmid }} created successfully on {{ storage }}"
|
||||||
|
|
||||||
|
- name: Update Terraform template name
|
||||||
|
hosts: localhost
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
terraform_dir: "{{ playbook_dir }}/../terraform"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Get image filename from earlier play
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
image_filename: "{{ hostvars['localhost']['image_filename'] }}"
|
||||||
|
|
||||||
|
- name: Extract template name from image filename
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
new_template_name: "{{ image_filename | regex_replace('\\.vma\\.zst$', '') | regex_replace('^vzdump-qemu-', '') }}"
|
||||||
|
|
||||||
|
- name: Read current Terraform variables file
|
||||||
|
ansible.builtin.slurp:
|
||||||
|
src: "{{ terraform_dir }}/variables.tf"
|
||||||
|
register: variables_tf_content
|
||||||
|
|
||||||
|
- name: Extract current template name from variables.tf
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
current_template_name: "{{ (variables_tf_content.content | b64decode) | regex_search('variable \"default_template_name\"[^}]+default\\s*=\\s*\"([^\"]+)\"', '\\1') | first }}"
|
||||||
|
|
||||||
|
- name: Check if template name has changed
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
template_name_changed: "{{ current_template_name != new_template_name }}"
|
||||||
|
|
||||||
|
- name: Display template name status
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Template name: {{ current_template_name }} -> {{ new_template_name }} ({{ 'changed' if template_name_changed else 'unchanged' }})"
|
||||||
|
|
||||||
|
- name: Update default_template_name in variables.tf
|
||||||
|
ansible.builtin.replace:
|
||||||
|
path: "{{ terraform_dir }}/variables.tf"
|
||||||
|
regexp: '(variable "default_template_name"[^}]+default\s*=\s*)"[^"]+"'
|
||||||
|
replace: '\1"{{ new_template_name }}"'
|
||||||
|
when: template_name_changed
|
||||||
|
|
||||||
|
- name: Display update result
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Updated terraform/variables.tf with new template name: {{ new_template_name }}"
|
||||||
|
when: template_name_changed
|
||||||
|
|||||||
78
playbooks/provision-approle.yml
Normal file
78
playbooks/provision-approle.yml
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
---
|
||||||
|
# Provision OpenBao AppRole credentials to an existing host
|
||||||
|
# Usage: nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=ha1
|
||||||
|
# Requires: BAO_ADDR and BAO_TOKEN environment variables set
|
||||||
|
|
||||||
|
- name: Fetch AppRole credentials from OpenBao
|
||||||
|
hosts: localhost
|
||||||
|
connection: local
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
vault_addr: "{{ lookup('env', 'BAO_ADDR') | default('https://vault01.home.2rjus.net:8200', true) }}"
|
||||||
|
domain: "home.2rjus.net"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Validate hostname is provided
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: "hostname variable is required. Use: -e hostname=<name>"
|
||||||
|
when: hostname is not defined
|
||||||
|
|
||||||
|
- name: Get role-id for host
|
||||||
|
ansible.builtin.command:
|
||||||
|
cmd: "bao read -field=role_id auth/approle/role/{{ hostname }}/role-id"
|
||||||
|
environment:
|
||||||
|
BAO_ADDR: "{{ vault_addr }}"
|
||||||
|
BAO_SKIP_VERIFY: "1"
|
||||||
|
register: role_id_result
|
||||||
|
changed_when: false
|
||||||
|
|
||||||
|
- name: Generate secret-id for host
|
||||||
|
ansible.builtin.command:
|
||||||
|
cmd: "bao write -field=secret_id -f auth/approle/role/{{ hostname }}/secret-id"
|
||||||
|
environment:
|
||||||
|
BAO_ADDR: "{{ vault_addr }}"
|
||||||
|
BAO_SKIP_VERIFY: "1"
|
||||||
|
register: secret_id_result
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
|
- name: Add target host to inventory
|
||||||
|
ansible.builtin.add_host:
|
||||||
|
name: "{{ hostname }}.{{ domain }}"
|
||||||
|
groups: vault_target
|
||||||
|
ansible_user: root
|
||||||
|
vault_role_id: "{{ role_id_result.stdout }}"
|
||||||
|
vault_secret_id: "{{ secret_id_result.stdout }}"
|
||||||
|
|
||||||
|
- name: Deploy AppRole credentials to host
|
||||||
|
hosts: vault_target
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Create AppRole directory
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /var/lib/vault/approle
|
||||||
|
state: directory
|
||||||
|
mode: "0700"
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
|
||||||
|
- name: Write role-id
|
||||||
|
ansible.builtin.copy:
|
||||||
|
content: "{{ vault_role_id }}"
|
||||||
|
dest: /var/lib/vault/approle/role-id
|
||||||
|
mode: "0600"
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
|
||||||
|
- name: Write secret-id
|
||||||
|
ansible.builtin.copy:
|
||||||
|
content: "{{ vault_secret_id }}"
|
||||||
|
dest: /var/lib/vault/approle/secret-id
|
||||||
|
mode: "0600"
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
|
||||||
|
- name: Display success
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "AppRole credentials provisioned to {{ inventory_hostname }}"
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# array of hosts
|
|
||||||
HOSTS=(
|
|
||||||
"ns1"
|
|
||||||
"ns2"
|
|
||||||
"ca"
|
|
||||||
"ha1"
|
|
||||||
"http-proxy"
|
|
||||||
"jelly01"
|
|
||||||
"monitoring01"
|
|
||||||
"nix-cache01"
|
|
||||||
"pgdb1"
|
|
||||||
)
|
|
||||||
|
|
||||||
for host in "${HOSTS[@]}"; do
|
|
||||||
echo "Rebuilding $host"
|
|
||||||
nixos-rebuild boot --flake .#${host} --target-host root@${host}
|
|
||||||
done
|
|
||||||
@@ -18,6 +18,8 @@ from manipulators import (
|
|||||||
remove_from_flake_nix,
|
remove_from_flake_nix,
|
||||||
remove_from_terraform_vms,
|
remove_from_terraform_vms,
|
||||||
remove_from_vault_terraform,
|
remove_from_vault_terraform,
|
||||||
|
remove_from_approle_tf,
|
||||||
|
find_host_secrets,
|
||||||
check_entries_exist,
|
check_entries_exist,
|
||||||
)
|
)
|
||||||
from models import HostConfig
|
from models import HostConfig
|
||||||
@@ -255,7 +257,10 @@ def handle_remove(
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Check what entries exist
|
# Check what entries exist
|
||||||
flake_exists, terraform_exists, vault_exists = check_entries_exist(hostname, repo_root)
|
flake_exists, terraform_exists, vault_exists, approle_exists = check_entries_exist(hostname, repo_root)
|
||||||
|
|
||||||
|
# Check for secrets in secrets.tf
|
||||||
|
host_secrets = find_host_secrets(hostname, repo_root)
|
||||||
|
|
||||||
# Collect all files in the host directory recursively
|
# Collect all files in the host directory recursively
|
||||||
files_in_host_dir = sorted([f for f in host_dir.rglob("*") if f.is_file()])
|
files_in_host_dir = sorted([f for f in host_dir.rglob("*") if f.is_file()])
|
||||||
@@ -294,11 +299,25 @@ def handle_remove(
|
|||||||
else:
|
else:
|
||||||
console.print(f" • terraform/vault/hosts-generated.tf [dim](not found)[/dim]")
|
console.print(f" • terraform/vault/hosts-generated.tf [dim](not found)[/dim]")
|
||||||
|
|
||||||
# Warn about secrets directory
|
if approle_exists:
|
||||||
|
console.print(f' • terraform/vault/approle.tf (host_policies["{hostname}"])')
|
||||||
|
else:
|
||||||
|
console.print(f" • terraform/vault/approle.tf [dim](not found)[/dim]")
|
||||||
|
|
||||||
|
# Warn about secrets in secrets.tf
|
||||||
|
if host_secrets:
|
||||||
|
console.print(f"\n[yellow]⚠️ Warning: Found {len(host_secrets)} secret(s) in terraform/vault/secrets.tf:[/yellow]")
|
||||||
|
for secret_path in host_secrets:
|
||||||
|
console.print(f' • "{secret_path}"')
|
||||||
|
console.print(f"\n [yellow]These will NOT be removed automatically.[/yellow]")
|
||||||
|
console.print(f" After removal, manually edit secrets.tf and run:")
|
||||||
|
for secret_path in host_secrets:
|
||||||
|
console.print(f" [white]vault kv delete secret/{secret_path}[/white]")
|
||||||
|
|
||||||
|
# Warn about legacy secrets directory
|
||||||
if secrets_exist:
|
if secrets_exist:
|
||||||
console.print(f"\n[yellow]⚠️ Warning: secrets/{hostname}/ directory exists and will NOT be deleted[/yellow]")
|
console.print(f"\n[yellow]⚠️ Warning: secrets/{hostname}/ directory exists (legacy SOPS)[/yellow]")
|
||||||
console.print(f" Manually remove if no longer needed: [white]rm -rf secrets/{hostname}/[/white]")
|
console.print(f" Manually remove if no longer needed: [white]rm -rf secrets/{hostname}/[/white]")
|
||||||
console.print(f" Also update .sops.yaml to remove the host's age key")
|
|
||||||
|
|
||||||
# Exit if dry run
|
# Exit if dry run
|
||||||
if dry_run:
|
if dry_run:
|
||||||
@@ -323,6 +342,13 @@ def handle_remove(
|
|||||||
else:
|
else:
|
||||||
console.print("[yellow]⚠[/yellow] Could not remove from terraform/vault/hosts-generated.tf")
|
console.print("[yellow]⚠[/yellow] Could not remove from terraform/vault/hosts-generated.tf")
|
||||||
|
|
||||||
|
# Remove from terraform/vault/approle.tf
|
||||||
|
if approle_exists:
|
||||||
|
if remove_from_approle_tf(hostname, repo_root):
|
||||||
|
console.print("[green]✓[/green] Removed from terraform/vault/approle.tf")
|
||||||
|
else:
|
||||||
|
console.print("[yellow]⚠[/yellow] Could not remove from terraform/vault/approle.tf")
|
||||||
|
|
||||||
# Remove from terraform/vms.tf
|
# Remove from terraform/vms.tf
|
||||||
if terraform_exists:
|
if terraform_exists:
|
||||||
if remove_from_terraform_vms(hostname, repo_root):
|
if remove_from_terraform_vms(hostname, repo_root):
|
||||||
@@ -345,19 +371,34 @@ def handle_remove(
|
|||||||
console.print(f"\n[bold green]✓ Host {hostname} removed successfully![/bold green]\n")
|
console.print(f"\n[bold green]✓ Host {hostname} removed successfully![/bold green]\n")
|
||||||
|
|
||||||
# Display next steps
|
# Display next steps
|
||||||
display_removal_next_steps(hostname, vault_exists)
|
display_removal_next_steps(hostname, vault_exists, approle_exists, host_secrets)
|
||||||
|
|
||||||
|
|
||||||
def display_removal_next_steps(hostname: str, had_vault: bool) -> None:
|
def display_removal_next_steps(hostname: str, had_vault: bool, had_approle: bool, host_secrets: list) -> None:
|
||||||
"""Display next steps after successful removal."""
|
"""Display next steps after successful removal."""
|
||||||
vault_file = " terraform/vault/hosts-generated.tf" if had_vault else ""
|
vault_files = ""
|
||||||
vault_apply = ""
|
|
||||||
if had_vault:
|
if had_vault:
|
||||||
|
vault_files += " terraform/vault/hosts-generated.tf"
|
||||||
|
if had_approle:
|
||||||
|
vault_files += " terraform/vault/approle.tf"
|
||||||
|
|
||||||
|
vault_apply = ""
|
||||||
|
if had_vault or had_approle:
|
||||||
vault_apply = f"""
|
vault_apply = f"""
|
||||||
3. Apply Vault changes:
|
3. Apply Vault changes:
|
||||||
[white]cd terraform/vault && tofu apply[/white]
|
[white]cd terraform/vault && tofu apply[/white]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
secrets_cleanup = ""
|
||||||
|
if host_secrets:
|
||||||
|
secrets_cleanup = f"""
|
||||||
|
5. Clean up secrets (manual):
|
||||||
|
Edit terraform/vault/secrets.tf to remove entries for {hostname}
|
||||||
|
Then delete from Vault:"""
|
||||||
|
for secret_path in host_secrets:
|
||||||
|
secrets_cleanup += f"\n [white]vault kv delete secret/{secret_path}[/white]"
|
||||||
|
secrets_cleanup += "\n"
|
||||||
|
|
||||||
next_steps = f"""[bold cyan]Next Steps:[/bold cyan]
|
next_steps = f"""[bold cyan]Next Steps:[/bold cyan]
|
||||||
|
|
||||||
1. Review changes:
|
1. Review changes:
|
||||||
@@ -367,9 +408,9 @@ def display_removal_next_steps(hostname: str, had_vault: bool) -> None:
|
|||||||
[white]cd terraform && tofu destroy -target='proxmox_vm_qemu.vm["{hostname}"]'[/white]
|
[white]cd terraform && tofu destroy -target='proxmox_vm_qemu.vm["{hostname}"]'[/white]
|
||||||
{vault_apply}
|
{vault_apply}
|
||||||
4. Commit changes:
|
4. Commit changes:
|
||||||
[white]git add -u hosts/{hostname} flake.nix terraform/vms.tf{vault_file}
|
[white]git add -u hosts/{hostname} flake.nix terraform/vms.tf{vault_files}
|
||||||
git commit -m "hosts: remove {hostname}"[/white]
|
git commit -m "hosts: remove {hostname}"[/white]
|
||||||
"""
|
{secrets_cleanup}"""
|
||||||
console.print(Panel(next_steps, border_style="cyan"))
|
console.print(Panel(next_steps, border_style="cyan"))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -144,7 +144,7 @@ resource "vault_approle_auth_backend_role" "generated_hosts" {
|
|||||||
|
|
||||||
backend = vault_auth_backend.approle.path
|
backend = vault_auth_backend.approle.path
|
||||||
role_name = each.key
|
role_name = each.key
|
||||||
token_policies = ["host-\${each.key}"]
|
token_policies = ["host-\${each.key}", "homelab-deploy"]
|
||||||
secret_id_ttl = 0 # Never expire (wrapped tokens provide time limit)
|
secret_id_ttl = 0 # Never expire (wrapped tokens provide time limit)
|
||||||
token_ttl = 3600
|
token_ttl = 3600
|
||||||
token_max_ttl = 3600
|
token_max_ttl = 3600
|
||||||
|
|||||||
@@ -101,7 +101,68 @@ def remove_from_vault_terraform(hostname: str, repo_root: Path) -> bool:
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def check_entries_exist(hostname: str, repo_root: Path) -> Tuple[bool, bool, bool]:
|
def remove_from_approle_tf(hostname: str, repo_root: Path) -> bool:
|
||||||
|
"""
|
||||||
|
Remove host entry from terraform/vault/approle.tf locals.host_policies.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
hostname: Hostname to remove
|
||||||
|
repo_root: Path to repository root
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if found and removed, False if not found
|
||||||
|
"""
|
||||||
|
approle_path = repo_root / "terraform" / "vault" / "approle.tf"
|
||||||
|
|
||||||
|
if not approle_path.exists():
|
||||||
|
return False
|
||||||
|
|
||||||
|
content = approle_path.read_text()
|
||||||
|
|
||||||
|
# Check if hostname exists in host_policies
|
||||||
|
hostname_pattern = rf'^\s+"{re.escape(hostname)}" = \{{'
|
||||||
|
if not re.search(hostname_pattern, content, re.MULTILINE):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Match the entire block from "hostname" = { to closing }
|
||||||
|
# The block contains paths = [ ... ] and possibly extra_policies = [...]
|
||||||
|
replace_pattern = rf'\n?\s+"{re.escape(hostname)}" = \{{[^}}]*\}}\n?'
|
||||||
|
new_content, count = re.subn(replace_pattern, "\n", content, flags=re.DOTALL)
|
||||||
|
|
||||||
|
if count == 0:
|
||||||
|
return False
|
||||||
|
|
||||||
|
approle_path.write_text(new_content)
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def find_host_secrets(hostname: str, repo_root: Path) -> list:
|
||||||
|
"""
|
||||||
|
Find secrets in terraform/vault/secrets.tf that belong to a host.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
hostname: Hostname to search for
|
||||||
|
repo_root: Path to repository root
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of secret paths found (e.g., ["hosts/hostname/test-service"])
|
||||||
|
"""
|
||||||
|
secrets_path = repo_root / "terraform" / "vault" / "secrets.tf"
|
||||||
|
|
||||||
|
if not secrets_path.exists():
|
||||||
|
return []
|
||||||
|
|
||||||
|
content = secrets_path.read_text()
|
||||||
|
|
||||||
|
# Find all secret paths matching hosts/{hostname}/
|
||||||
|
pattern = rf'"(hosts/{re.escape(hostname)}/[^"]+)"'
|
||||||
|
matches = re.findall(pattern, content)
|
||||||
|
|
||||||
|
# Return unique paths, preserving order
|
||||||
|
return list(dict.fromkeys(matches))
|
||||||
|
|
||||||
|
|
||||||
|
def check_entries_exist(hostname: str, repo_root: Path) -> Tuple[bool, bool, bool, bool]:
|
||||||
"""
|
"""
|
||||||
Check which entries exist for a hostname.
|
Check which entries exist for a hostname.
|
||||||
|
|
||||||
@@ -110,7 +171,7 @@ def check_entries_exist(hostname: str, repo_root: Path) -> Tuple[bool, bool, boo
|
|||||||
repo_root: Path to repository root
|
repo_root: Path to repository root
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (flake_exists, terraform_vms_exists, vault_exists)
|
Tuple of (flake_exists, terraform_vms_exists, vault_generated_exists, approle_exists)
|
||||||
"""
|
"""
|
||||||
# Check flake.nix
|
# Check flake.nix
|
||||||
flake_path = repo_root / "flake.nix"
|
flake_path = repo_root / "flake.nix"
|
||||||
@@ -131,7 +192,15 @@ def check_entries_exist(hostname: str, repo_root: Path) -> Tuple[bool, bool, boo
|
|||||||
vault_content = vault_tf_path.read_text()
|
vault_content = vault_tf_path.read_text()
|
||||||
vault_exists = f'"{hostname}"' in vault_content
|
vault_exists = f'"{hostname}"' in vault_content
|
||||||
|
|
||||||
return (flake_exists, terraform_exists, vault_exists)
|
# Check terraform/vault/approle.tf
|
||||||
|
approle_path = repo_root / "terraform" / "vault" / "approle.tf"
|
||||||
|
approle_exists = False
|
||||||
|
if approle_path.exists():
|
||||||
|
approle_content = approle_path.read_text()
|
||||||
|
approle_pattern = rf'^\s+"{re.escape(hostname)}" = \{{'
|
||||||
|
approle_exists = bool(re.search(approle_pattern, approle_content, re.MULTILINE))
|
||||||
|
|
||||||
|
return (flake_exists, terraform_exists, vault_exists, approle_exists)
|
||||||
|
|
||||||
|
|
||||||
def update_flake_nix(config: HostConfig, repo_root: Path, force: bool = False) -> None:
|
def update_flake_nix(config: HostConfig, repo_root: Path, force: bool = False) -> None:
|
||||||
@@ -150,17 +219,10 @@ def update_flake_nix(config: HostConfig, repo_root: Path, force: bool = False) -
|
|||||||
new_entry = f""" {config.hostname} = nixpkgs.lib.nixosSystem {{
|
new_entry = f""" {config.hostname} = nixpkgs.lib.nixosSystem {{
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {{
|
specialArgs = {{
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
}};
|
}};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{{ config, pkgs, ... }}:
|
|
||||||
{{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}}
|
|
||||||
)
|
|
||||||
./hosts/{config.hostname}
|
./hosts/{config.hostname}
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
}};
|
}};
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -13,6 +13,17 @@
|
|||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Host metadata (adjust as needed)
|
||||||
|
homelab.host = {
|
||||||
|
tier = "test"; # Start in test tier, move to prod after validation
|
||||||
|
};
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/vda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|||||||
@@ -140,20 +140,22 @@ def validate_ip_unique(ip: Optional[str], repo_root: Path) -> None:
|
|||||||
ip_part = ip.split("/")[0]
|
ip_part = ip.split("/")[0]
|
||||||
|
|
||||||
# Check all hosts/*/configuration.nix files
|
# Check all hosts/*/configuration.nix files
|
||||||
|
# Search for IP with CIDR notation to match static IP assignments
|
||||||
|
# (e.g., "10.69.13.5/24") but not DNS resolver entries (e.g., "10.69.13.5")
|
||||||
hosts_dir = repo_root / "hosts"
|
hosts_dir = repo_root / "hosts"
|
||||||
if hosts_dir.exists():
|
if hosts_dir.exists():
|
||||||
for config_file in hosts_dir.glob("*/configuration.nix"):
|
for config_file in hosts_dir.glob("*/configuration.nix"):
|
||||||
content = config_file.read_text()
|
content = config_file.read_text()
|
||||||
if ip_part in content:
|
if ip in content:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"IP address {ip_part} already in use in {config_file}"
|
f"IP address {ip_part} already in use in {config_file}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check terraform/vms.tf
|
# Check terraform/vms.tf - search for full IP with CIDR
|
||||||
terraform_file = repo_root / "terraform" / "vms.tf"
|
terraform_file = repo_root / "terraform" / "vms.tf"
|
||||||
if terraform_file.exists():
|
if terraform_file.exists():
|
||||||
content = terraform_file.read_text()
|
content = terraform_file.read_text()
|
||||||
if ip_part in content:
|
if ip in content:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"IP address {ip_part} already in use in {terraform_file}"
|
f"IP address {ip_part} already in use in {terraform_file}"
|
||||||
)
|
)
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user