# Compare commits

383 commits, comparing `host-vault` ... `nrec-forge`
| Author | SHA1 | Date | |
|---|---|---|---|
|
6a3e78a479
|
|||
|
cfc0c6f6cb
|
|||
|
822380695e
|
|||
|
0941bd52f5
|
|||
| 9ebdd94773 | |||
|
adc267bd95
|
|||
|
7ffe2d71d6
|
|||
|
dd9ba42eb5
|
|||
| 3ee0433a6f | |||
|
73d804105b
|
|||
|
d2a4e4a0a1
|
|||
| 28eba49d68 | |||
| 4bf726a674 | |||
| 774fd92524 | |||
|
55da459108
|
|||
|
813c5c0f29
|
|||
|
013ab8f621
|
|||
| f75b773485 | |||
| 58c3844950 | |||
| 80e5fa08fa | |||
|
cf55d07ce5
|
|||
| 4941e38dac | |||
| 03ffcc1ad0 | |||
|
5e92eb3220
|
|||
| 2321e191a2 | |||
|
136116ab33
|
|||
|
c8cadd09c5
|
|||
|
72acaa872b
|
|||
|
a7c1ce932d
|
|||
|
2b42145d94
|
|||
|
05e8556bda
|
|||
|
75fdd7ae40
|
|||
|
5346889b73
|
|||
|
7e19f51dfa
|
|||
|
9f7aab86a0
|
|||
|
bb53b922fa
|
|||
|
75cd7c6c2d
|
|||
|
72c3a938b0
|
|||
|
2f89d564f7
|
|||
|
4a83363ee5
|
|||
|
b578520905
|
|||
|
8a5aa1c4f5
|
|||
|
0f8c4783a8
|
|||
|
2ca2509083
|
|||
|
58702bd10b
|
|||
|
c9f47acb01
|
|||
|
09ce018fb2
|
|||
| 3042803c4d | |||
|
1e7200b494
|
|||
|
eec1e374b2
|
|||
|
fcc410afad
|
|||
|
59f0c7ceda
|
|||
| d713f06c6e | |||
|
7374d1ff7f
|
|||
| e912c75b6c | |||
|
b218b4f8bc
|
|||
|
65acf13e6f
|
|||
| 95a96b2192 | |||
|
4f593126c0
|
|||
| 1bba6f106a | |||
|
a6013d3950
|
|||
| 7f69c0738a | |||
|
35924c7b01
|
|||
|
87d8571d62
|
|||
|
43c81f6688
|
|||
|
58f901ad3e
|
|||
|
c13921d302
|
|||
|
2903873d52
|
|||
|
74e7c9faa4
|
|||
| 471f536f1f | |||
|
a013e80f1a
|
|||
|
4cbaa33475
|
|||
|
e329f87b0b
|
|||
|
c151f31011
|
|||
| f5362d6936 | |||
|
3e7aabc73a
|
|||
|
361e7f2a1b
|
|||
|
1942591d2e
|
|||
|
4d614d8716
|
|||
| fd7caf7f00 | |||
|
af8e385b6e
|
|||
|
0db9fc6802
|
|||
|
5d68662035
|
|||
|
d485948df0
|
|||
|
7b804450a3
|
|||
|
2f0dad1acc
|
|||
|
1544415ef3
|
|||
|
5babd7f507
|
|||
|
7e0c5fbf0f
|
|||
|
ffaf95d109
|
|||
|
b2b6ab4799
|
|||
|
5d3d93b280
|
|||
|
ae823e439d
|
|||
|
0d9f49a3b4
|
|||
|
08d9e1ec3f
|
|||
|
fa8d65b612
|
|||
|
6726f111e3
|
|||
| 3a083285cb | |||
|
ed1821b073
|
|||
|
fa4a418007
|
|||
| 963e5f6d3c | |||
|
0bc10cb1fe
|
|||
|
b03e2e8ee4
|
|||
|
ddcbc30665
|
|||
|
75210805d5
|
|||
|
ade0538717
|
|||
|
83fce5f927
|
|||
|
afff3f28ca
|
|||
|
49f7e3ae2e
|
|||
|
751edfc11d
|
|||
|
98a7301985
|
|||
| 34efa58cfe | |||
|
5bfb51a497
|
|||
|
f83145d97a
|
|||
|
47747329c4
|
|||
|
2d9ca2a73f
|
|||
|
98ea679ef2
|
|||
|
b709c0b703
|
|||
|
33c5d5b3f0
|
|||
|
0a28c5f495
|
|||
|
9bd48e0808
|
|||
|
1460eea700
|
|||
|
98c4f54f94
|
|||
|
d1b0a5dc20
|
|||
|
4d32707130
|
|||
|
8e1753c2c8
|
|||
|
75e4fb61a5
|
|||
|
2be213e454
|
|||
|
12c252653b
|
|||
|
6493338c4c
|
|||
|
6e08ba9720
|
|||
|
7ff3d2a09b
|
|||
|
e85f15b73d
|
|||
|
2f5a2a4bf1
|
|||
|
287141c623
|
|||
|
9ed11b712f
|
|||
|
ffad2dd205
|
|||
|
ed7d2aa727
|
|||
|
bf7a025364
|
|||
| 4ae99dbc89 | |||
|
5c142b1323
|
|||
|
4091e51f41
|
|||
|
a8e558a6b7
|
|||
|
4efc798c38
|
|||
|
016f8c9119
|
|||
| fec2a261ab | |||
|
60c04a2052
|
|||
|
39e3f37263
|
|||
| a2d93baba8 | |||
|
f66dfc753c
|
|||
| 79a6a72719 | |||
|
89d0a6f358
|
|||
|
03ebee4d82
|
|||
|
05630eb4d4
|
|||
|
1e52eec02a
|
|||
|
d333aa0164
|
|||
|
a5d5827dcc
|
|||
|
1c13ec12a4
|
|||
|
4bf0eeeadb
|
|||
| 304cb117ce | |||
|
02270a0e4a
|
|||
|
030e8518c5
|
|||
|
9ffdd4f862
|
|||
|
0b977808ca
|
|||
|
8786113f8f
|
|||
|
fdb2c31f84
|
|||
|
78eb04205f
|
|||
| 19cb61ebbc | |||
|
9ed09c9a9c
|
|||
|
b31c64f1b9
|
|||
|
54b6e37420
|
|||
|
b845a8bb8b
|
|||
|
bfbf0cea68
|
|||
|
3abe5e83a7
|
|||
|
67c27555f3
|
|||
|
1674b6a844
|
|||
|
311be282b6
|
|||
|
11cbb64097
|
|||
|
e2dd21c994
|
|||
|
463342133e
|
|||
|
de36b9d016
|
|||
|
3f1d966919
|
|||
|
7fcc043a4d
|
|||
|
70ec5f8109
|
|||
|
c2ec34cab9
|
|||
|
8fbf1224fa
|
|||
|
8959829f77
|
|||
|
93dbb45802
|
|||
|
538c2ad097
|
|||
|
d99c82c74c
|
|||
|
ca0e3fd629
|
|||
|
732e9b8c22
|
|||
|
3a14ffd6b5
|
|||
|
f9a3961457
|
|||
|
003d4ccf03
|
|||
|
735b8a9ee3
|
|||
|
94feae82a0
|
|||
|
3f94f7ee95
|
|||
|
b7e398c9a7
|
|||
|
8ec2a083bd
|
|||
|
ec4ac1477e
|
|||
|
e937c68965
|
|||
|
98e808cd6c
|
|||
|
ba9f47f914
|
|||
|
1066e81ba8
|
|||
|
f0950b33de
|
|||
|
bf199bd7c6
|
|||
| 4e8ecb8a99 | |||
|
38c104ea8c
|
|||
|
536daee4c7
|
|||
| 4c1debf0a3 | |||
|
f36457ee0d
|
|||
|
aedccbd9a0
|
|||
|
bdc6057689
|
|||
| 3a25e3f7bc | |||
|
46f03871f1
|
|||
|
9d019f2b9a
|
|||
|
21db7e9573
|
|||
|
979040aaf7
|
|||
|
8791c29402
|
|||
|
c7a067d7b3
|
|||
|
c518093578
|
|||
| 0b462f0a96 | |||
|
116abf3bec
|
|||
|
b794aa89db
|
|||
|
50a85daa44
|
|||
|
23e561cf49
|
|||
|
7d291f85bf
|
|||
|
2a842c655a
|
|||
|
1f4a5571dc
|
|||
| 13d6d0ea3a | |||
|
eea000b337
|
|||
|
f19ba2f4b6
|
|||
|
a90d9c33d5
|
|||
|
09c9df1bbe
|
|||
|
ae3039af19
|
|||
|
11261c4636
|
|||
|
4ca3c8890f
|
|||
|
78e8d7a600
|
|||
|
0cf72ec191
|
|||
|
6a3a51407e
|
|||
|
a1ae766eb8
|
|||
|
11999b37f3
|
|||
|
29b2b7db52
|
|||
|
b046a1b862
|
|||
|
38348c5980
|
|||
|
370cf2b03a
|
|||
|
7bc465b414
|
|||
|
8d7bc50108
|
|||
|
03e70ac094
|
|||
|
3b32c9479f
|
|||
|
b0d35f9a99
|
|||
|
26ca6817f0
|
|||
|
b03a9b3b64
|
|||
|
f805b9f629
|
|||
|
f3adf7e77f
|
|||
|
f6eca9decc
|
|||
| 6e93b8eae3 | |||
|
c214f8543c
|
|||
|
7933127d77
|
|||
|
13c3897e86
|
|||
|
0643f23281
|
|||
|
ad8570f8db
|
|||
| 2f195d26d3 | |||
|
a926d34287
|
|||
|
be2421746e
|
|||
|
12bf0683f5
|
|||
|
e8a43c6715
|
|||
|
eef52bb8c5
|
|||
|
c6cdbc6799
|
|||
|
4d724329a6
|
|||
|
881e70df27
|
|||
|
b9a269d280
|
|||
|
fcf1a66103
|
|||
|
2034004280
|
|||
| af43f88394 | |||
|
a834497fe8
|
|||
| d3de2a1511 | |||
|
97ff774d3f
|
|||
|
f2c30cc24f
|
|||
|
7e80d2e0bc
|
|||
|
1f5b7b13e2
|
|||
|
c53e36c3f3
|
|||
|
04a252b857
|
|||
|
5d26f52e0d
|
|||
|
506a692548
|
|||
|
fa8f4f0784
|
|||
|
025570dea1
|
|||
|
15c00393f1
|
|||
|
787c14c7a6
|
|||
|
eee3dde04f
|
|||
| 682b07b977 | |||
| 70661ac3d9 | |||
|
506e93a5e2
|
|||
|
b6c41aa910
|
|||
| aa6e00a327 | |||
|
258e350b89
|
|||
|
eba195c192
|
|||
|
bbb22e588e
|
|||
|
879e7aba60
|
|||
|
39a4ea98ab
|
|||
| 1d90dc2181 | |||
|
e9857afc11
|
|||
| 88e9036cb4 | |||
|
59e1962d75
|
|||
|
3dc4422ba0
|
|||
|
f0963624bc
|
|||
| 7b46f94e48 | |||
|
32968147b5
|
|||
|
c515a6b4e1
|
|||
|
4d8b94ce83
|
|||
|
8b0a4ea33a
|
|||
| 5be1f43c24 | |||
|
b322b1156b
|
|||
|
3cccfc0487
|
|||
|
41d4226812
|
|||
|
351fb6f720
|
|||
|
7d92c55d37
|
|||
|
6d117d68ca
|
|||
| a46fbdaa70 | |||
|
2c9d86eaf2
|
|||
|
ccb1c3fe2e
|
|||
|
0700033c0a
|
|||
|
4d33018285
|
|||
|
678fd3d6de
|
|||
|
9d74aa5c04
|
|||
|
fe80ec3576
|
|||
|
870fb3e532
|
|||
|
e602e8d70b
|
|||
|
28b8d7c115
|
|||
|
64f2688349
|
|||
|
09d9d71e2b
|
|||
|
cc799f5929
|
|||
|
0abdda8e8a
|
|||
| 4076361bf7 | |||
|
0ef63ad874
|
|||
| 8f29141dd1 | |||
|
3a9a47f1ad
|
|||
|
fa6380e767
|
|||
|
86a077e152
|
|||
| 9da57c6a2f | |||
| da9dd02d10 | |||
|
e7980978c7
|
|||
|
dd1b64de27
|
|||
|
4e8cc124f2
|
|||
|
a2a55f3955
|
|||
|
c38034ba41
|
|||
|
d7d4b0846c
|
|||
| 8ca7c4e402 | |||
|
106912499b
|
|||
|
83af00458b
|
|||
|
67d5de3eb8
|
|||
|
cee1b264cd
|
|||
|
4ceee04308
|
|||
| e3ced5bcda | |||
| 15459870cd | |||
|
d1861eefb5
|
|||
|
d25fc99e1d
|
|||
|
b5da9431aa
|
|||
| 0e5dea635e | |||
| 86249c466b | |||
| 5d560267cf | |||
|
63662b89e0
|
|||
|
7ae474fd3e
|
|||
|
f0525b5c74
|
|||
|
42c391b355
|
|||
|
048536ba70
|
|||
| cccce09406 | |||
|
01d4812280
|
|||
| b5364d2ccc | |||
|
7fc69c40a6
|
|||
|
34a2f2ab50
|
|||
| 16b3214982 | |||
| 244dd0c78b | |||
|
238ad45c14
|
|||
|
c694b9889a
|
|||
|
3f2f91aedd
|
|||
|
5d513fd5af
|
|||
|
b6f1e80c2a
|
|||
|
4133eafc4e
|
|||
|
ace848b29c
|
|||
|
b012df9f34
|
|||
|
ab053c25bd
|
180
.claude/agents/auditor.md
Normal file
180
.claude/agents/auditor.md
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
---
|
||||||
|
name: auditor
|
||||||
|
description: Analyzes audit logs to investigate user activity, command execution, and suspicious behavior on hosts. Can be used standalone for security reviews or called by other agents for behavioral context.
|
||||||
|
tools: Read, Grep, Glob
|
||||||
|
mcpServers:
|
||||||
|
- lab-monitoring
|
||||||
|
---
|
||||||
|
|
||||||
|
You are a security auditor for a NixOS homelab infrastructure. Your task is to analyze audit logs and reconstruct user activity on hosts.
|
||||||
|
|
||||||
|
## Input
|
||||||
|
|
||||||
|
You may receive:
|
||||||
|
- A host or list of hosts to investigate
|
||||||
|
- A time window (e.g., "last hour", "today", "between 14:00 and 15:00")
|
||||||
|
- Optional context: specific events to look for, user to focus on, or suspicious activity to investigate
|
||||||
|
- Optional context from a parent investigation (e.g., "a service stopped at 14:32, what happened around that time?")
|
||||||
|
|
||||||
|
## Audit Log Structure
|
||||||
|
|
||||||
|
Logs are shipped to Loki via promtail. Audit events use these labels:
|
||||||
|
- `hostname` - hostname
|
||||||
|
- `systemd_unit` - typically `auditd.service` for audit logs
|
||||||
|
- `job` - typically `systemd-journal`
|
||||||
|
|
||||||
|
Audit log entries contain structured data:
|
||||||
|
- `EXECVE` - command execution with full arguments
|
||||||
|
- `USER_LOGIN` / `USER_LOGOUT` - session start/end
|
||||||
|
- `USER_CMD` - sudo command execution
|
||||||
|
- `CRED_ACQ` / `CRED_DISP` - credential acquisition/disposal
|
||||||
|
- `SERVICE_START` / `SERVICE_STOP` - systemd service events
|
||||||
|
|
||||||
|
## Investigation Techniques
|
||||||
|
|
||||||
|
### 1. SSH Session Activity
|
||||||
|
|
||||||
|
Find SSH logins and session activity:
|
||||||
|
```logql
|
||||||
|
{hostname="<hostname>", systemd_unit="sshd.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Look for:
|
||||||
|
- Accepted/Failed authentication
|
||||||
|
- Session opened/closed
|
||||||
|
- Unusual source IPs or users
|
||||||
|
|
||||||
|
### 2. Command Execution
|
||||||
|
|
||||||
|
Query executed commands (filter out noise):
|
||||||
|
```logql
|
||||||
|
{hostname="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
|
||||||
|
```
|
||||||
|
|
||||||
|
Further filtering:
|
||||||
|
- Exclude systemd noise: `!= "systemd" != "/nix/store"`
|
||||||
|
- Focus on specific commands: `|= "rm" |= "-rf"`
|
||||||
|
- Focus on specific user: `|= "uid=1000"`
|
||||||
|
|
||||||
|
### 3. Sudo Activity
|
||||||
|
|
||||||
|
Check for privilege escalation:
|
||||||
|
```logql
|
||||||
|
{hostname="<hostname>"} |= "sudo" |= "COMMAND"
|
||||||
|
```
|
||||||
|
|
||||||
|
Or via audit:
|
||||||
|
```logql
|
||||||
|
{hostname="<hostname>"} |= "USER_CMD"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Service Manipulation
|
||||||
|
|
||||||
|
Check if services were manually stopped/started:
|
||||||
|
```logql
|
||||||
|
{hostname="<hostname>"} |= "EXECVE" |= "systemctl"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. File Operations
|
||||||
|
|
||||||
|
Look for file modifications (if auditd rules are configured):
|
||||||
|
```logql
|
||||||
|
{hostname="<hostname>"} |= "EXECVE" |= "vim"
|
||||||
|
{hostname="<hostname>"} |= "EXECVE" |= "nano"
|
||||||
|
{hostname="<hostname>"} |= "EXECVE" |= "rm"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Query Guidelines
|
||||||
|
|
||||||
|
**Start narrow, expand if needed:**
|
||||||
|
- Begin with `limit: 20-30`
|
||||||
|
- Use tight time windows: `start: "15m"` or `start: "30m"`
|
||||||
|
- Add filters progressively
|
||||||
|
|
||||||
|
**Avoid:**
|
||||||
|
- Querying all audit logs without EXECVE filter (extremely verbose)
|
||||||
|
- Large time ranges without specific filters
|
||||||
|
- Limits over 50 without tight filters
|
||||||
|
|
||||||
|
**Time-bounded queries:**
|
||||||
|
When investigating around a specific event:
|
||||||
|
```logql
|
||||||
|
{hostname="<hostname>"} |= "EXECVE" != "systemd"
|
||||||
|
```
|
||||||
|
With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"`
|
||||||
|
|
||||||
|
## Suspicious Patterns to Watch For
|
||||||
|
|
||||||
|
1. **Unusual login times** - Activity outside normal hours
|
||||||
|
2. **Failed authentication** - Brute force attempts
|
||||||
|
3. **Privilege escalation** - Unexpected sudo usage
|
||||||
|
4. **Reconnaissance commands** - `whoami`, `id`, `uname`, `cat /etc/passwd`
|
||||||
|
5. **Data exfiltration indicators** - `curl`, `wget`, `scp`, `rsync` to external destinations
|
||||||
|
6. **Persistence mechanisms** - Cron modifications, systemd service creation
|
||||||
|
7. **Log tampering** - Commands targeting log files
|
||||||
|
8. **Lateral movement** - SSH to other internal hosts
|
||||||
|
9. **Service manipulation** - Stopping security services, disabling firewalls
|
||||||
|
10. **Cleanup activity** - Deleting bash history, clearing logs
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
### For Standalone Security Reviews
|
||||||
|
|
||||||
|
```
|
||||||
|
## Activity Summary
|
||||||
|
|
||||||
|
**Host:** <hostname>
|
||||||
|
**Time Period:** <start> to <end>
|
||||||
|
**Sessions Found:** <count>
|
||||||
|
|
||||||
|
## User Sessions
|
||||||
|
|
||||||
|
### Session 1: <user> from <source_ip>
|
||||||
|
- **Login:** HH:MM:SSZ
|
||||||
|
- **Logout:** HH:MM:SSZ (or ongoing)
|
||||||
|
- **Commands executed:**
|
||||||
|
- HH:MM:SSZ - <command>
|
||||||
|
- HH:MM:SSZ - <command>
|
||||||
|
|
||||||
|
## Suspicious Activity
|
||||||
|
|
||||||
|
[If any patterns from the watch list were detected]
|
||||||
|
- **Finding:** <description>
|
||||||
|
- **Evidence:** <log entries>
|
||||||
|
- **Risk Level:** Low / Medium / High
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
[Overall assessment: normal activity, concerning patterns, or clear malicious activity]
|
||||||
|
```
|
||||||
|
|
||||||
|
### When Called by Another Agent
|
||||||
|
|
||||||
|
Provide a focused response addressing the specific question:
|
||||||
|
|
||||||
|
```
|
||||||
|
## Audit Findings
|
||||||
|
|
||||||
|
**Query:** <what was asked>
|
||||||
|
**Time Window:** <investigated period>
|
||||||
|
|
||||||
|
## Relevant Activity
|
||||||
|
|
||||||
|
[Chronological list of relevant events]
|
||||||
|
- HH:MM:SSZ - <event>
|
||||||
|
- HH:MM:SSZ - <event>
|
||||||
|
|
||||||
|
## Assessment
|
||||||
|
|
||||||
|
[Direct answer to the question with supporting evidence]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Guidelines
|
||||||
|
|
||||||
|
- Reconstruct timelines chronologically
|
||||||
|
- Correlate events (login → commands → logout)
|
||||||
|
- Note gaps or missing data
|
||||||
|
- Distinguish between automated (systemd, cron) and interactive activity
|
||||||
|
- Consider the host's role and tier when assessing severity
|
||||||
|
- When called by another agent, focus on answering their specific question
|
||||||
|
- Don't speculate without evidence - state what the logs show and don't show
|
||||||
## `.claude/agents/investigate-alarm.md` (new file, +211 lines)

---
name: investigate-alarm
description: Investigates a single system alarm by querying Prometheus metrics and Loki logs, analyzing configuration files for affected hosts/services, and providing root cause analysis.
tools: Read, Grep, Glob
mcpServers:
  - lab-monitoring
  - git-explorer
---

You are an alarm investigation specialist for a NixOS homelab infrastructure. Your task is to analyze a single alarm and determine its root cause.

## Input

You will receive information about an alarm, which may include:
- Alert name and severity
- Affected host or service
- Alert expression/threshold
- Current value or status
- When it started firing

## Investigation Process

### 1. Understand the Alert Context

Start by understanding what the alert is measuring:
- Use `get_alert` if you have a fingerprint, or `list_alerts` to find matching alerts
- Use `get_metric_metadata` to understand the metric being monitored
- Use `search_metrics` to find related metrics

### 2. Query Current State

Gather evidence about the current system state:
- Use `query` to check the current metric values and related metrics
- Use `list_targets` to verify the host/service is being scraped successfully
- Look for correlated metrics that might explain the issue

### 3. Check Service Logs

Search for relevant log entries using `query_logs`. Focus on service-specific logs and errors.

**Query strategies (start narrow, expand if needed):**
- Start with `limit: 20-30`, increase only if needed
- Use tight time windows: `start: "15m"` or `start: "30m"` initially
- Filter to specific services: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- Search for errors: `{hostname="<hostname>"} |= "error"` or `|= "failed"`

**Common patterns:**
- Service logs: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- All errors on host: `{hostname="<hostname>"} |= "error"`
- Journal for a unit: `{hostname="<hostname>", systemd_unit="nginx.service"} |= "failed"`

**Avoid:**
- Using `start: "1h"` with no filters on busy hosts
- Limits over 50 without specific filters

### 4. Investigate User Activity

For any analysis of user activity, **always spawn the `auditor` agent**. Do not query audit logs (EXECVE, USER_LOGIN, etc.) directly - delegate this to the auditor.

**Always call the auditor when:**
- A service stopped unexpectedly (may have been manually stopped)
- A process was killed or a config was changed
- You need to know who was logged in around the time of an incident
- You need to understand what commands led to the current state
- The cause isn't obvious from service logs alone

**Do NOT try to query audit logs yourself.** The auditor is specialized for:
- Parsing EXECVE records and reconstructing command lines
- Correlating SSH sessions with commands executed
- Identifying suspicious patterns
- Filtering out systemd/nix-store noise

**Example prompt for auditor:**
```
Investigate user activity on <hostname> between <start_time> and <end_time>.
Context: The prometheus-node-exporter service stopped at 14:32.
Determine if it was manually stopped and by whom.
```

Incorporate the auditor's findings into your timeline and root cause analysis.

### 5. Check Configuration (if relevant)

If the alert relates to a NixOS-managed service:
- Check host configuration in `/hosts/<hostname>/`
- Check service modules in `/services/<service>/`
- Look for thresholds, resource limits, or misconfigurations
- Check `homelab.host` options for tier/priority/role metadata

### 6. Check for Configuration Drift

Use the git-explorer MCP server to compare the host's deployed configuration against the current master branch. This helps identify:
- Hosts running outdated configurations
- Recent changes that might have caused the issue
- Whether a fix has already been committed but not deployed

**Step 1: Get the deployed revision from Prometheus**
```promql
nixos_flake_info{hostname="<hostname>"}
```
The `current_rev` label contains the deployed git commit hash.

**Step 2: Check if the host is behind master**
```
resolve_ref("master")          # Get current master commit
is_ancestor(deployed, master)  # Check if host is behind
```

**Step 3: See what commits are missing**
```
commits_between(deployed, master)  # List commits not yet deployed
```

**Step 4: Check which files changed**
```
get_diff_files(deployed, master)  # Files modified since deployment
```
Look for files in `hosts/<hostname>/`, `services/<relevant-service>/`, or `system/` that affect this host.

**Step 5: View configuration at the deployed revision**
```
get_file_at_commit(deployed, "services/<service>/default.nix")
```
Compare against the current file to understand differences.

**Step 6: Find when something changed**
```
search_commits("<service-name>")  # Find commits mentioning the service
get_commit_info(<hash>)           # Get full details of a specific change
```

**Example workflow for a service-related alert:**
1. Query `nixos_flake_info{hostname="monitoring02"}` → `current_rev: 8959829`
2. `resolve_ref("master")` → `4633421`
3. `is_ancestor("8959829", "4633421")` → Yes, host is behind
4. `commits_between("8959829", "4633421")` → 7 commits missing
5. `get_diff_files("8959829", "4633421")` → Check if relevant service files changed
6. If a fix was committed after the deployed rev, recommend deployment

### 7. Consider Common Causes

For infrastructure alerts, common causes include:
- **Manual intervention**: Service manually stopped/restarted (call auditor to confirm)
- **Configuration drift**: Host running outdated config, fix already in master
- **Disk space**: Nix store growth, logs, temp files
- **Memory pressure**: Service memory leaks, insufficient limits
- **CPU**: Runaway processes, build jobs
- **Network**: DNS issues, connectivity problems
- **Service restarts**: Failed upgrades, configuration errors
- **Scrape failures**: Service down, firewall issues, port changes

**Note:** If a service stopped unexpectedly and service logs don't show a crash or error, it was likely manual intervention - call the auditor to investigate.

## Output Format

Provide a concise report with one of two outcomes:

### If Root Cause Identified:

```
## Root Cause
[1-2 sentence summary of the root cause]

## Timeline
[Chronological sequence of relevant events leading to the alert]
- HH:MM:SSZ - [Event description]
- HH:MM:SSZ - [Event description]
- HH:MM:SSZ - [Alert fired]

### Timeline sources
- HH:MM:SSZ - [Source for information about this event. Which metric or log file]
- HH:MM:SSZ - [Source for information about this event. Which metric or log file]
- HH:MM:SSZ - [Alert fired]

## Evidence
- [Specific metric values or log entries that support the conclusion]
- [Configuration details if relevant]

## Recommended Actions
1. [Specific remediation step]
2. [Follow-up actions if any]
```

### If Root Cause Unclear:

```
## Investigation Summary
[What was checked and what was found]

## Possible Causes
- [Hypothesis 1 with supporting/contradicting evidence]
- [Hypothesis 2 with supporting/contradicting evidence]

## Additional Information Needed
- [Specific data, logs, or access that would help]
- [Suggested queries or checks for the operator]
```

## Guidelines

- Be concise and actionable
- Reference specific metric names and values as evidence
- Include log snippets when they're informative
- Don't speculate without evidence
- If the alert is a false positive or expected behavior, explain why
- Consider the host's tier (test vs prod) when assessing severity
- Build a timeline from log timestamps and metrics to show the sequence of events
- **Query logs incrementally**: start with narrow filters and small limits, expand only if needed
- **Always delegate to the auditor agent** for any user activity analysis - never query EXECVE or audit logs directly
## `.claude/skills/observability/SKILL.md` (new file, +372 lines; truncated below)

---
name: observability
description: Reference guide for exploring Prometheus metrics and Loki logs when troubleshooting homelab issues. Use when investigating system state, deployments, service health, or searching logs.
---

# Observability Troubleshooting Guide

Quick reference for exploring Prometheus metrics and Loki logs to troubleshoot homelab issues.

## Available Tools

Use the `lab-monitoring` MCP server tools:

**Metrics:**
- `search_metrics` - Find metrics by name substring
- `get_metric_metadata` - Get type/help for a specific metric
- `query` - Execute PromQL queries
- `list_targets` - Check scrape target health
- `list_alerts` / `get_alert` - View active alerts

**Logs:**
- `query_logs` - Execute LogQL queries against Loki
- `list_labels` - List available log labels
- `list_label_values` - List values for a specific label

---

## Logs Reference

### Label Reference

Available labels for log queries:
- `hostname` - Hostname (e.g., `ns1`, `monitoring02`, `ha1`) - matches the Prometheus `hostname` label
- `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`)
- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
- `filename` - For `varlog` job, the log file path
- `tier` - Deployment tier (`test` or `prod`)
- `role` - Host role (e.g., `dns`, `vault`, `monitoring`) - matches the Prometheus `role` label
- `level` - Log level mapped from journal PRIORITY (`critical`, `error`, `warning`, `notice`, `info`, `debug`) - journal scrape only

### Log Format

Journal logs are JSON-formatted. Key fields:
- `MESSAGE` - The actual log message
- `PRIORITY` - Syslog priority (6=info, 4=warning, 3=error)
- `SYSLOG_IDENTIFIER` - Program name

### Basic LogQL Queries

**Logs from a specific service on a host:**
```logql
{hostname="ns1", systemd_unit="nsd.service"}
```

**All logs from a host:**
```logql
{hostname="monitoring02"}
```

**Logs from a service across all hosts:**
```logql
{systemd_unit="nixos-upgrade.service"}
```

**Substring matching (case-sensitive):**
```logql
{hostname="ha1"} |= "error"
```

**Exclude pattern:**
```logql
{hostname="ns1"} != "routine"
```

**Regex matching:**
```logql
{systemd_unit="victoriametrics.service"} |~ "scrape.*failed"
```

**Filter by level (journal scrape only):**
```logql
{level="error"}                        # All errors across the fleet
{level=~"critical|error", tier="prod"} # Prod errors and criticals
{hostname="ns1", level="warning"}      # Warnings from a specific host
```

**Filter by tier/role:**
```logql
{tier="prod"} |= "error"              # All errors on prod hosts
{role="dns"}                          # All DNS server logs
{tier="test", job="systemd-journal"}  # Journal logs from test hosts
```

**File-based logs (caddy access logs, etc):**
```logql
{job="varlog", hostname="nix-cache01"}
{job="varlog", filename="/var/log/caddy/nix-cache.log"}
```

### Time Ranges

Default lookback is 1 hour. Use `start` parameter for older logs:
- `start: "1h"` - Last hour (default)
- `start: "24h"` - Last 24 hours
- `start: "168h"` - Last 7 days

### Common Services

Useful systemd units for troubleshooting:
- `nixos-upgrade.service` - Daily auto-upgrade logs
- `nsd.service` - DNS server (ns1/ns2)
- `victoriametrics.service` - Metrics collection
- `loki.service` - Log aggregation
- `caddy.service` - Reverse proxy
- `home-assistant.service` - Home automation
- `step-ca.service` - Internal CA
- `openbao.service` - Secrets management
- `sshd.service` - SSH daemon
- `nix-gc.service` - Nix garbage collection

### Bootstrap Logs

VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:

- `hostname` - Target hostname
- `branch` - Git branch being deployed
- `stage` - Bootstrap stage (see table below)

**Bootstrap stages:**

| Stage | Message | Meaning |
|-------|---------|---------|
| `starting` | Bootstrap starting for \<host\> (branch: \<branch\>) | Bootstrap service has started |
| `network_ok` | Network connectivity confirmed | Can reach git server |
| `vault_ok` | Vault credentials unwrapped and stored | AppRole credentials provisioned |
| `vault_skip` | No Vault token provided - skipping credential setup | No wrapped token was provided |
| `vault_warn` | Failed to unwrap Vault token - continuing without secrets | Token unwrap failed (expired/used) |
| `building` | Starting nixos-rebuild boot | NixOS build starting |
| `success` | Build successful - rebooting into new configuration | Build complete, rebooting |
| `failed` | nixos-rebuild failed - manual intervention required | Build failed |

**Bootstrap queries:**

```logql
{job="bootstrap"}                            # All bootstrap logs
{job="bootstrap", hostname="myhost"}         # Specific host
{job="bootstrap", stage="failed"}            # All failures
{job="bootstrap", stage=~"building|success"} # Track build progress
```

### Extracting JSON Fields

Parse JSON and filter on fields:
```logql
{systemd_unit="victoriametrics.service"} | json | PRIORITY="3"
```

---

## Metrics Reference

### Deployment & Version Status

Check which NixOS revision hosts are running:

```promql
nixos_flake_info
```

Labels:
- `current_rev` - Git commit of the running NixOS configuration
- `remote_rev` - Latest commit on the remote repository
- `nixpkgs_rev` - Nixpkgs revision used to build the system
- `nixos_version` - Full NixOS version string (e.g., `25.11.20260203.e576e3c`)

Check if hosts are behind on updates:

```promql
nixos_flake_revision_behind == 1
```

View flake input versions:

```promql
nixos_flake_input_info
```

Labels: `input` (name), `rev` (revision), `type` (git/github)

Check flake input age:

```promql
nixos_flake_input_age_seconds / 86400
```

Returns age in days for each flake input.

### System Health

Basic host availability:

```promql
up{job="node-exporter"}
```

CPU usage by host:

```promql
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
```

Memory usage:

```promql
1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
```

Disk space (root filesystem):

```promql
node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}
```

### Prometheus Jobs

All available Prometheus job names:

**System exporters (on all/most hosts):**
- `node-exporter` - System metrics (CPU, memory, disk, network)
- `nixos-exporter` - NixOS flake revision and generation info
- `systemd-exporter` - Systemd unit status metrics
- `homelab-deploy` - Deployment listener metrics

**Service-specific exporters:**
- `caddy` - Reverse proxy metrics (http-proxy)
- `nix-cache_caddy` - Nix binary cache metrics
|
||||||
|
- `home-assistant` - Home automation metrics (ha1)
|
||||||
|
- `jellyfin` - Media server metrics (jelly01)
|
||||||
|
- `kanidm` - Authentication server metrics (kanidm01)
|
||||||
|
- `nats` - NATS messaging metrics (nats1)
|
||||||
|
- `openbao` - Secrets management metrics (vault01)
|
||||||
|
- `unbound` - DNS resolver metrics (ns1, ns2)
|
||||||
|
- `wireguard` - VPN tunnel metrics (http-proxy)
|
||||||
|
|
||||||
|
**Monitoring stack (localhost on monitoring02):**
|
||||||
|
- `victoriametrics` - VictoriaMetrics self-metrics
|
||||||
|
- `loki` - Loki self-metrics
|
||||||
|
- `grafana` - Grafana self-metrics
|
||||||
|
- `alertmanager` - Alertmanager metrics
|
||||||
|
|
||||||
|
**External/infrastructure:**
|
||||||
|
- `pve-exporter` - Proxmox hypervisor metrics
|
||||||
|
- `smartctl` - Disk SMART health (gunter)
|
||||||
|
- `restic_rest` - Backup server metrics
|
||||||
|
- `ghettoptt` - PTT service metrics (gunter)
|
||||||
|
|
||||||
|
### Target Labels
|
||||||
|
|
||||||
|
All scrape targets have these labels:
|
||||||
|
|
||||||
|
**Standard labels:**
|
||||||
|
- `instance` - Full target address (`<hostname>.home.2rjus.net:<port>`)
|
||||||
|
- `job` - Job name (e.g., `node-exporter`, `unbound`, `nixos-exporter`)
|
||||||
|
- `hostname` - Short hostname (e.g., `ns1`, `monitoring02`) - use this for host filtering
|
||||||
|
|
||||||
|
**Host metadata labels** (when configured in `homelab.host`):
|
||||||
|
- `role` - Host role (e.g., `dns`, `build-host`, `vault`)
|
||||||
|
- `tier` - Deployment tier (`test` for test VMs, absent for prod)
|
||||||
|
- `dns_role` - DNS-specific role (`primary` or `secondary` for ns1/ns2)
|
||||||
|
|
||||||
|
### Filtering by Host
|
||||||
|
|
||||||
|
Use the `hostname` label for easy host filtering across all jobs:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
{hostname="ns1"} # All metrics from ns1
|
||||||
|
node_load1{hostname="monitoring02"} # Specific metric by hostname
|
||||||
|
up{hostname="ha1"} # Check if ha1 is up
|
||||||
|
```
|
||||||
|
|
||||||
|
This is simpler than wildcarding the `instance` label:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Old way (still works but verbose)
|
||||||
|
up{instance=~"monitoring02.*"}
|
||||||
|
|
||||||
|
# New way (preferred)
|
||||||
|
up{hostname="monitoring02"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Filtering by Role/Tier
|
||||||
|
|
||||||
|
Filter hosts by their role or tier:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
up{role="dns"} # All DNS servers (ns1, ns2)
|
||||||
|
node_cpu_seconds_total{role="build-host"} # Build hosts only (nix-cache01)
|
||||||
|
up{tier="test"} # All test-tier VMs
|
||||||
|
up{dns_role="primary"} # Primary DNS only (ns1)
|
||||||
|
```
|
||||||
|
|
||||||
|
Current host labels:
|
||||||
|
| Host | Labels |
|
||||||
|
|------|--------|
|
||||||
|
| ns1 | `role=dns`, `dns_role=primary` |
|
||||||
|
| ns2 | `role=dns`, `dns_role=secondary` |
|
||||||
|
| nix-cache01 | `role=build-host` |
|
||||||
|
| vault01 | `role=vault` |
|
||||||
|
| kanidm01 | `role=auth`, `tier=test` |
|
||||||
|
| testvm01/02/03 | `tier=test` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting Workflows
|
||||||
|
|
||||||
|
### Check Deployment Status Across Fleet
|
||||||
|
|
||||||
|
1. Query `nixos_flake_info` to see all hosts' current revisions
|
||||||
|
2. Check `nixos_flake_revision_behind` for hosts needing updates
|
||||||
|
3. Look at upgrade logs: `{systemd_unit="nixos-upgrade.service"}` with `start: "24h"`
|
||||||
|
|
||||||
|
### Investigate Service Issues
|
||||||
|
|
||||||
|
1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures
|
||||||
|
2. Use `list_targets` to see target health details
|
||||||
|
3. Query service logs: `{hostname="<host>", systemd_unit="<service>.service"}`
|
||||||
|
4. Search for errors: `{hostname="<host>"} |= "error"`
|
||||||
|
5. Check `list_alerts` for related alerts
|
||||||
|
6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers
|
||||||
|
|
||||||
|
### After Deploying Changes
|
||||||
|
|
||||||
|
1. Verify `current_rev` updated in `nixos_flake_info`
|
||||||
|
2. Confirm `nixos_flake_revision_behind == 0`
|
||||||
|
3. Check service logs for startup issues
|
||||||
|
4. Check service metrics are being scraped
|
||||||
|
|
||||||
|
### Monitor VM Bootstrap
|
||||||
|
|
||||||
|
When provisioning new VMs, track bootstrap progress:
|
||||||
|
|
||||||
|
1. Watch bootstrap logs: `{job="bootstrap", hostname="<hostname>"}`
|
||||||
|
2. Check for failures: `{job="bootstrap", hostname="<hostname>", stage="failed"}`
|
||||||
|
3. After success, verify host appears in metrics: `up{hostname="<hostname>"}`
|
||||||
|
4. Check logs are flowing: `{hostname="<hostname>"}`
|
||||||
|
|
||||||
|
See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline.
|
||||||
|
|
||||||
|
### Debug SSH/Access Issues
|
||||||
|
|
||||||
|
```logql
|
||||||
|
{hostname="<host>", systemd_unit="sshd.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check Recent Upgrades
|
||||||
|
|
||||||
|
```logql
|
||||||
|
{systemd_unit="nixos-upgrade.service"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Use `start: "24h"` to see the last 24 hours of upgrades across all hosts.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Default scrape interval is 15s for most metrics targets
|
||||||
|
- Default log lookback is 1h - use `start` parameter for older logs
|
||||||
|
- Use `rate()` for counter metrics, direct queries for gauges
|
||||||
|
- Use the `hostname` label to filter metrics by host (simpler than regex on `instance`)
|
||||||
|
- Host metadata labels (`role`, `tier`, `dns_role`) are propagated to all scrape targets
|
||||||
|
- The `MESSAGE` log field contains the actual log content in JSON format
|
||||||
90
.claude/skills/quick-plan/SKILL.md
Normal file
90
.claude/skills/quick-plan/SKILL.md
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
---
|
||||||
|
name: quick-plan
|
||||||
|
description: Create a planning document for a future homelab project. Use when the user wants to document ideas for future work without implementing immediately.
|
||||||
|
argument-hint: [topic or feature to plan]
|
||||||
|
---
|
||||||
|
|
||||||
|
# Quick Plan Generator
|
||||||
|
|
||||||
|
Create a planning document for a future homelab infrastructure project. Plans are for documenting ideas and approaches that will be implemented later, not immediately.
|
||||||
|
|
||||||
|
## Input
|
||||||
|
|
||||||
|
The user provides: $ARGUMENTS
|
||||||
|
|
||||||
|
## Process
|
||||||
|
|
||||||
|
1. **Understand the topic**: Research the codebase to understand:
|
||||||
|
- Current state of related systems
|
||||||
|
- Existing patterns and conventions
|
||||||
|
- Relevant NixOS options or packages
|
||||||
|
- Any constraints or dependencies
|
||||||
|
|
||||||
|
2. **Evaluate options**: If there are multiple approaches, research and compare them with pros/cons.
|
||||||
|
|
||||||
|
3. **Draft the plan**: Create a markdown document following the structure below.
|
||||||
|
|
||||||
|
4. **Save the plan**: Write to `docs/plans/<topic-slug>.md` using a kebab-case filename derived from the topic.
|
||||||
|
|
||||||
|
## Plan Structure
|
||||||
|
|
||||||
|
Use these sections as appropriate (not all plans need every section):
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Title
|
||||||
|
|
||||||
|
## Overview/Goal
|
||||||
|
Brief description of what this plan addresses and why.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
What exists today that's relevant to this plan.
|
||||||
|
|
||||||
|
## Options Evaluated (if multiple approaches)
|
||||||
|
For each option:
|
||||||
|
- **Option Name**
|
||||||
|
- **Pros:** bullet points
|
||||||
|
- **Cons:** bullet points
|
||||||
|
- **Verdict:** brief assessment
|
||||||
|
|
||||||
|
Or use a comparison table for structured evaluation.
|
||||||
|
|
||||||
|
## Recommendation/Decision
|
||||||
|
What approach is recommended and why. Include rationale.
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
Numbered phases or steps. Be specific but not overly detailed.
|
||||||
|
Can use sub-sections for major phases.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
Things still to be determined. Use checkbox format:
|
||||||
|
- [ ] Question 1?
|
||||||
|
- [ ] Question 2?
|
||||||
|
|
||||||
|
## Notes (optional)
|
||||||
|
Additional context, caveats, or references.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Style Guidelines
|
||||||
|
|
||||||
|
- **Concise**: Use bullet points, avoid verbose paragraphs
|
||||||
|
- **Technical but accessible**: Include NixOS config snippets when relevant
|
||||||
|
- **Future-oriented**: These are plans, not specifications
|
||||||
|
- **Acknowledge uncertainty**: Use "Open Questions" for unresolved decisions
|
||||||
|
- **Reference existing patterns**: Mention how this fits with existing infrastructure
|
||||||
|
- **Tables for comparisons**: Use markdown tables when comparing options
|
||||||
|
- **Practical focus**: Emphasize what needs to happen, not theory
|
||||||
|
- **Mermaid diagrams**: Use mermaid code blocks for architecture diagrams, flow charts, or other graphs when relevant to the plan. Keep node labels short and use `<br/>` for line breaks
|
||||||
|
|
||||||
|
## Examples of Good Plans
|
||||||
|
|
||||||
|
Reference these existing plans for style guidance:
|
||||||
|
- `docs/plans/auth-system-replacement.md` - Good option evaluation with table
|
||||||
|
- `docs/plans/truenas-migration.md` - Good decision documentation with rationale
|
||||||
|
- `docs/plans/remote-access.md` - Good multi-option comparison
|
||||||
|
- `docs/plans/prometheus-scrape-target-labels.md` - Good implementation detail level
|
||||||
|
|
||||||
|
## After Creating the Plan
|
||||||
|
|
||||||
|
1. Tell the user the plan was saved to `docs/plans/<filename>.md`
|
||||||
|
2. Summarize the key points
|
||||||
|
3. Ask if they want any adjustments before committing
|
||||||
13
.gitignore
vendored
13
.gitignore
vendored
@@ -1,5 +1,9 @@
|
|||||||
.direnv/
|
.direnv/
|
||||||
result
|
result
|
||||||
|
result-*
|
||||||
|
|
||||||
|
# MCP config (contains secrets)
|
||||||
|
.mcp.json
|
||||||
|
|
||||||
# Terraform/OpenTofu
|
# Terraform/OpenTofu
|
||||||
terraform/.terraform/
|
terraform/.terraform/
|
||||||
@@ -10,3 +14,12 @@ terraform/terraform.tfvars
|
|||||||
terraform/*.auto.tfvars
|
terraform/*.auto.tfvars
|
||||||
terraform/crash.log
|
terraform/crash.log
|
||||||
terraform/crash.*.log
|
terraform/crash.*.log
|
||||||
|
|
||||||
|
terraform/vault/.terraform/
|
||||||
|
terraform/vault/.terraform.lock.hcl
|
||||||
|
terraform/vault/*.tfstate
|
||||||
|
terraform/vault/*.tfstate.*
|
||||||
|
terraform/vault/terraform.tfvars
|
||||||
|
terraform/vault/*.auto.tfvars
|
||||||
|
terraform/vault/crash.log
|
||||||
|
terraform/vault/crash.*.log
|
||||||
|
|||||||
48
.mcp.json.example
Normal file
48
.mcp.json.example
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
{
|
||||||
|
"mcpServers": {
|
||||||
|
"nixpkgs-options": {
|
||||||
|
"command": "nix",
|
||||||
|
"args": ["run", "git+https://git.t-juice.club/torjus/labmcp#nixpkgs-search", "--", "options", "serve"],
|
||||||
|
"env": {
|
||||||
|
"NIXPKGS_SEARCH_DATABASE": "sqlite:///run/user/1000/labmcp/nixpkgs-search.db"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs-packages": {
|
||||||
|
"command": "nix",
|
||||||
|
"args": ["run", "git+https://git.t-juice.club/torjus/labmcp#nixpkgs-search", "--", "packages", "serve"],
|
||||||
|
"env": {
|
||||||
|
"NIXPKGS_SEARCH_DATABASE": "sqlite:///run/user/1000/labmcp/nixpkgs-search.db"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"lab-monitoring": {
|
||||||
|
"command": "nix",
|
||||||
|
"args": ["run", "git+https://git.t-juice.club/torjus/labmcp#lab-monitoring", "--", "serve", "--enable-silences"],
|
||||||
|
"env": {
|
||||||
|
"PROMETHEUS_URL": "https://prometheus.home.2rjus.net",
|
||||||
|
"ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net",
|
||||||
|
"LOKI_URL": "https://loki.home.2rjus.net",
|
||||||
|
"LOKI_USERNAME": "promtail",
|
||||||
|
"LOKI_PASSWORD": "<password from: bao kv get -field=password secret/shared/loki/push-auth>"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"homelab-deploy": {
|
||||||
|
"command": "nix",
|
||||||
|
"args": [
|
||||||
|
"run",
|
||||||
|
"git+https://git.t-juice.club/torjus/homelab-deploy",
|
||||||
|
"--",
|
||||||
|
"mcp",
|
||||||
|
"--nats-url", "nats://nats1.home.2rjus.net:4222",
|
||||||
|
"--nkey-file", "/home/torjus/.config/homelab-deploy/test-deployer.nkey",
|
||||||
|
"--enable-builds"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"git-explorer": {
|
||||||
|
"command": "nix",
|
||||||
|
"args": ["run", "git+https://git.t-juice.club/torjus/labmcp#git-explorer", "--", "serve"],
|
||||||
|
"env": {
|
||||||
|
"GIT_REPO_PATH": "/home/torjus/git/nixos-servers"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
72
.sops.yaml
72
.sops.yaml
@@ -1,72 +0,0 @@
|
|||||||
keys:
|
|
||||||
- &admin_torjus age1lznyk4ee7e7x8n92cq2n87kz9920473ks5u9jlhd3dczfzq4wamqept56u
|
|
||||||
- &server_ns1 age1hz2lz4k050ru3shrk5j3zk3f8azxmrp54pktw5a7nzjml4saudesx6jsl0
|
|
||||||
- &server_ns2 age1w2q4gm2lrcgdzscq8du3ssyvk6qtzm4fcszc92z9ftclq23yyydqdga5um
|
|
||||||
- &server_ns3 age1snmhmpavqy7xddmw4nuny0u4xusqmnqxqarjmghkm5zaluff84eq5xatrd
|
|
||||||
- &server_ns4 age12a3nyvjs8jrwmpkf3tgawel3nwcklwsr35ktmytnvhpawqwzrsfqpgcy0q
|
|
||||||
- &server_ha1 age1d2w5zece9647qwyq4vas9qyqegg96xwmg6c86440a6eg4uj6dd2qrq0w3l
|
|
||||||
- &server_nixos-test1 age1gcyfkxh4fq5zdp0dh484aj82ksz66wrly7qhnpv0r0p576sn9ekse8e9ju
|
|
||||||
- &server_inc1 age1g5luz2rtel3surgzuh62rkvtey7lythrvfenyq954vmeyfpxjqkqdj3wt8
|
|
||||||
- &server_http-proxy age1gq8434ku0xekqmvnseeunv83e779cg03c06gwrusnymdsr3rpufqx6vr3m
|
|
||||||
- &server_ca age1288993th0ge00reg4zqueyvmkrsvk829cs068eekjqfdprsrkeqql7mljk
|
|
||||||
- &server_monitoring01 age1vpns76ykll8jgdlu3h05cur4ew2t3k7u03kxdg8y6ypfhsfhq9fqyurjey
|
|
||||||
- &server_jelly01 age1hchvlf3apn8g8jq2743pw53sd6v6ay6xu6lqk0qufrjeccan9vzsc7hdfq
|
|
||||||
- &server_nix-cache01 age1w029fksjv0edrff9p7s03tgk3axecdkppqymfpwfn2nu2gsqqefqc37sxq
|
|
||||||
- &server_pgdb1 age1ha34qeksr4jeaecevqvv2afqem67eja2mvawlmrqsudch0e7fe7qtpsekv
|
|
||||||
- &server_nats1 age1cxt8kwqzx35yuldazcc49q88qvgy9ajkz30xu0h37uw3ts97jagqgmn2ga
|
|
||||||
- &server_auth01 age16prza00sqzuhwwcyakj6z4hvwkruwkqpmmrsn94a5ucgpkelncdq2ldctk
|
|
||||||
creation_rules:
|
|
||||||
- path_regex: secrets/[^/]+\.(yaml|json|env|ini)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ns1
|
|
||||||
- *server_ns2
|
|
||||||
- *server_ns3
|
|
||||||
- *server_ns4
|
|
||||||
- *server_ha1
|
|
||||||
- *server_nixos-test1
|
|
||||||
- *server_inc1
|
|
||||||
- *server_http-proxy
|
|
||||||
- *server_ca
|
|
||||||
- *server_monitoring01
|
|
||||||
- *server_jelly01
|
|
||||||
- *server_nix-cache01
|
|
||||||
- *server_pgdb1
|
|
||||||
- *server_nats1
|
|
||||||
- *server_auth01
|
|
||||||
- path_regex: secrets/ns3/[^/]+\.(yaml|json|env|ini)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ns3
|
|
||||||
- path_regex: secrets/ca/[^/]+\.(yaml|json|env|ini|)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ca
|
|
||||||
- path_regex: secrets/monitoring01/[^/]+\.(yaml|json|env|ini)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_monitoring01
|
|
||||||
- path_regex: secrets/ca/keys/.+
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_ca
|
|
||||||
- path_regex: secrets/nix-cache01/.+
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_nix-cache01
|
|
||||||
- path_regex: secrets/http-proxy/.+
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_http-proxy
|
|
||||||
- path_regex: secrets/auth01/[^/]+\.(yaml|json|env|ini|)
|
|
||||||
key_groups:
|
|
||||||
- age:
|
|
||||||
- *admin_torjus
|
|
||||||
- *server_auth01
|
|
||||||
409
CLAUDE.md
409
CLAUDE.md
@@ -21,10 +21,63 @@ nixos-rebuild build --flake .#<hostname>
|
|||||||
nix build .#nixosConfigurations.<hostname>.config.system.build.toplevel
|
nix build .#nixosConfigurations.<hostname>.config.system.build.toplevel
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Important:** Do NOT pipe `nix build` commands to other commands like `tail` or `head`. Piping can hide errors and make builds appear successful when they actually failed. Always run `nix build` without piping to see the full output.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# BAD - hides errors
|
||||||
|
nix build .#create-host 2>&1 | tail -20
|
||||||
|
|
||||||
|
# GOOD - shows all output and errors
|
||||||
|
nix build .#create-host
|
||||||
|
```
|
||||||
|
|
||||||
### Deployment
|
### Deployment
|
||||||
|
|
||||||
Do not automatically deploy changes. Deployments are usually done by updating the master branch, and then triggering the auto update on the specific host.
|
Do not automatically deploy changes. Deployments are usually done by updating the master branch, and then triggering the auto update on the specific host.
|
||||||
|
|
||||||
|
### SSH Commands
|
||||||
|
|
||||||
|
Do not run SSH commands directly. If a command needs to be run on a remote host, provide the command to the user and ask them to run it manually.
|
||||||
|
|
||||||
|
### Sharing Command Output via Loki
|
||||||
|
|
||||||
|
All hosts have the `pipe-to-loki` script for sending command output or terminal sessions to Loki, allowing users to share output with Claude without copy-pasting.
|
||||||
|
|
||||||
|
**Pipe mode** - send command output:
|
||||||
|
```bash
|
||||||
|
command | pipe-to-loki # Auto-generated ID
|
||||||
|
command | pipe-to-loki --id my-test # Custom ID
|
||||||
|
```
|
||||||
|
|
||||||
|
**Session mode** - record interactive terminal session:
|
||||||
|
```bash
|
||||||
|
pipe-to-loki --record # Start recording, exit to send
|
||||||
|
pipe-to-loki --record --id my-session # With custom ID
|
||||||
|
```
|
||||||
|
|
||||||
|
The script prints the session ID which the user can share. Query results with:
|
||||||
|
```logql
|
||||||
|
{job="pipe-to-loki"} # All entries
|
||||||
|
{job="pipe-to-loki", id="my-test"} # Specific ID
|
||||||
|
{job="pipe-to-loki", hostname="testvm01"} # From specific host
|
||||||
|
{job="pipe-to-loki", type="session"} # Only sessions
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing Feature Branches on Hosts
|
||||||
|
|
||||||
|
All hosts have the `nixos-rebuild-test` helper script for testing feature branches before merging:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On the target host, test a feature branch
|
||||||
|
nixos-rebuild-test boot <branch-name>
|
||||||
|
nixos-rebuild-test switch <branch-name>
|
||||||
|
|
||||||
|
# Additional arguments are passed through to nixos-rebuild
|
||||||
|
nixos-rebuild-test boot my-feature --show-trace
|
||||||
|
```
|
||||||
|
|
||||||
|
When working on a feature branch that requires testing on a live host, suggest using this command instead of the full flake URL syntax.
|
||||||
|
|
||||||
### Flake Management
|
### Flake Management
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -36,13 +89,61 @@ Do not run `nix flake update`. Should only be done manually by user.
|
|||||||
### Development Environment
|
### Development Environment
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Enter development shell (provides ansible, python3)
|
# Enter development shell
|
||||||
nix develop
|
nix develop
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The devshell provides: `ansible`, `tofu` (OpenTofu), `bao` (OpenBao CLI), `create-host`, and `homelab-deploy`.
|
||||||
|
|
||||||
|
**Important:** When suggesting commands that use devshell tools, always use `nix develop -c <command>` syntax rather than assuming the user is already in a devshell. For example:
|
||||||
|
```bash
|
||||||
|
# Good - works regardless of current shell
|
||||||
|
nix develop -c tofu plan
|
||||||
|
|
||||||
|
# Avoid - requires user to be in devshell
|
||||||
|
tofu plan
|
||||||
|
```
|
||||||
|
|
||||||
|
**OpenTofu:** Use the `-chdir` option instead of `cd` when running tofu commands in subdirectories:
|
||||||
|
```bash
|
||||||
|
# Good - uses -chdir option
|
||||||
|
nix develop -c tofu -chdir=terraform plan
|
||||||
|
nix develop -c tofu -chdir=terraform/vault apply
|
||||||
|
|
||||||
|
# Avoid - changing directories
|
||||||
|
cd terraform && tofu plan
|
||||||
|
```
|
||||||
|
|
||||||
|
### Ansible
|
||||||
|
|
||||||
|
Ansible configuration and playbooks are in `/ansible/`. See [ansible/README.md](ansible/README.md) for inventory groups, available playbooks, and usage examples.
|
||||||
|
|
||||||
|
The devshell sets `ANSIBLE_CONFIG` automatically, so no `-i` flag is needed.
|
||||||
|
|
||||||
### Secrets Management
|
### Secrets Management
|
||||||
|
|
||||||
Secrets are handled by sops. Do not edit any `.sops.yaml` or any file within `secrets/`. Ask the user to modify if necessary.
|
Secrets are managed by OpenBao (Vault) using AppRole authentication. Most hosts use the
|
||||||
|
`vault.secrets` option defined in `system/vault-secrets.nix` to fetch secrets at boot.
|
||||||
|
Terraform manages the secrets and AppRole policies in `terraform/vault/`.
|
||||||
|
|
||||||
|
### Git Workflow
|
||||||
|
|
||||||
|
**Important:** Never commit directly to `master` unless the user explicitly asks for it. Always create a feature branch for changes.
|
||||||
|
|
||||||
|
**Important:** Never amend commits to `master` unless the user explicitly asks for it. Amending rewrites history and causes issues for deployed configurations.
|
||||||
|
|
||||||
|
**Important:** Never force push to `master`. If a commit on master has an error, fix it with a new commit rather than rewriting history.
|
||||||
|
|
||||||
|
**Important:** Do not use `gh pr create` to create pull requests. The git server does not support GitHub CLI for PR creation. Instead, push the branch and let the user create the PR manually via the web interface.
|
||||||
|
|
||||||
|
When starting a new plan or task, the first step should typically be to create and checkout a new branch with an appropriate name (e.g., `git checkout -b dns-automation` or `git checkout -b fix-nginx-config`).
|
||||||
|
|
||||||
|
### Plan Management
|
||||||
|
|
||||||
|
When creating plans for large features, follow this workflow:
|
||||||
|
|
||||||
|
1. When implementation begins, save a copy of the plan to `docs/plans/` (e.g., `docs/plans/feature-name.md`)
|
||||||
|
2. Once the feature is fully implemented, move the plan to `docs/plans/completed/`
|
||||||
|
|
||||||
### Git Commit Messages
|
### Git Commit Messages
|
||||||
|
|
||||||
@@ -53,26 +154,143 @@ Examples:
|
|||||||
- `template2: add proxmox image configuration`
|
- `template2: add proxmox image configuration`
|
||||||
- `terraform: add VM deployment configuration`
|
- `terraform: add VM deployment configuration`
|
||||||
|
|
||||||
|
### Clipboard
|
||||||
|
|
||||||
|
To copy text to the clipboard, pipe to `wl-copy` (Wayland):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "text" | wl-copy
|
||||||
|
```
|
||||||
|
|
||||||
|
### NixOS Options and Packages Lookup
|
||||||
|
|
||||||
|
Two MCP servers are available for searching NixOS options and packages:
|
||||||
|
|
||||||
|
- **nixpkgs-options** - Search and lookup NixOS configuration option documentation
|
||||||
|
- **nixpkgs-packages** - Search and lookup Nix packages from nixpkgs
|
||||||
|
|
||||||
|
**Session Setup:** At the start of each session, index the nixpkgs revision from `flake.lock` to ensure documentation matches the project's nixpkgs version:
|
||||||
|
|
||||||
|
1. Read `flake.lock` and find the `nixpkgs` node's `rev` field
|
||||||
|
2. Call `index_revision` with that git hash (both servers share the same index)
|
||||||
|
|
||||||
|
**Options Tools (nixpkgs-options):**
|
||||||
|
|
||||||
|
- `search_options` - Search for options by name or description (e.g., query "nginx" or "postgresql")
|
||||||
|
- `get_option` - Get full details for a specific option (e.g., `services.loki.configuration`)
|
||||||
|
- `get_file` - Fetch the source file from nixpkgs that declares an option
|
||||||
|
|
||||||
|
**Package Tools (nixpkgs-packages):**
|
||||||
|
|
||||||
|
- `search_packages` - Search for packages by name or description (e.g., query "nginx" or "python")
|
||||||
|
- `get_package` - Get full details for a specific package by attribute path (e.g., `firefox`, `python312Packages.requests`)
|
||||||
|
- `get_file` - Fetch the source file from nixpkgs that defines a package
|
||||||
|
|
||||||
|
This ensures documentation matches the exact nixpkgs version (currently NixOS 25.11) used by this flake.
|
||||||
|
|
||||||
|
### Lab Monitoring
|
||||||
|
|
||||||
|
The **lab-monitoring** MCP server provides access to Prometheus metrics and Loki logs. Use the `/observability` skill for detailed reference on:
|
||||||
|
|
||||||
|
- Available Prometheus jobs and exporters
|
||||||
|
- Loki labels and LogQL query syntax
|
||||||
|
- Bootstrap log monitoring for new VMs
|
||||||
|
- Common troubleshooting workflows
|
||||||
|
|
||||||
|
The skill contains up-to-date information about all scrape targets, host labels, and example queries.
|
||||||
|
|
||||||
|
### Deploying to Test Hosts
|
||||||
|
|
||||||
|
The **homelab-deploy** MCP server enables remote deployments to test-tier hosts via NATS messaging.
|
||||||
|
|
||||||
|
**Available Tools:**
|
||||||
|
|
||||||
|
- `deploy` - Deploy NixOS configuration to test-tier hosts
|
||||||
|
- `list_hosts` - List available deployment targets
|
||||||
|
|
||||||
|
**Deploy Parameters:**
|
||||||
|
|
||||||
|
- `hostname` - Target a specific host (e.g., `vaulttest01`)
|
||||||
|
- `role` - Deploy to all hosts with a specific role (e.g., `vault`)
|
||||||
|
- `all` - Deploy to all test-tier hosts
|
||||||
|
- `action` - nixos-rebuild action: `switch` (default), `boot`, `test`, `dry-activate`
|
||||||
|
- `branch` - Git branch or commit to deploy (default: `master`)
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
|
||||||
|
```
|
||||||
|
# List available hosts
|
||||||
|
list_hosts()
|
||||||
|
|
||||||
|
# Deploy to a specific host
|
||||||
|
deploy(hostname="vaulttest01", action="switch")
|
||||||
|
|
||||||
|
# Dry-run deployment
|
||||||
|
deploy(hostname="vaulttest01", action="dry-activate")
|
||||||
|
|
||||||
|
# Deploy to all hosts with a role
|
||||||
|
deploy(role="vault", action="switch")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** Only test-tier hosts with `homelab.deploy.enable = true` and the listener service running will respond to deployments.
|
||||||
|
|
||||||
|
**Deploying to Prod Hosts:**
|
||||||
|
|
||||||
|
The MCP server only deploys to test-tier hosts. For prod hosts, use the CLI directly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop -c homelab-deploy -- deploy \
|
||||||
|
--nats-url nats://nats1.home.2rjus.net:4222 \
|
||||||
|
--nkey-file ~/.config/homelab-deploy/admin-deployer.nkey \
|
||||||
|
--branch <branch-name> \
|
||||||
|
--action switch \
|
||||||
|
deploy.prod.<hostname>
|
||||||
|
```
|
||||||
|
|
||||||
|
Subject format: `deploy.<tier>.<hostname>` (e.g., `deploy.prod.monitoring02`, `deploy.test.testvm01`)
|
||||||
|
|
||||||
|
**Verifying Deployments:**
|
||||||
|
|
||||||
|
After deploying, use the `nixos_flake_info` metric from nixos-exporter to verify the host is running the expected revision:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
nixos_flake_info{instance=~"vaulttest01.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `current_rev` label contains the git commit hash of the deployed flake configuration.
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
### Directory Structure
|
### Directory Structure
|
||||||
|
|
||||||
- `/flake.nix` - Central flake defining all 16 NixOS configurations
|
- `/flake.nix` - Central flake defining all NixOS configurations
|
||||||
- `/hosts/<hostname>/` - Per-host configurations
|
- `/hosts/<hostname>/` - Per-host configurations
|
||||||
- `default.nix` - Entry point, imports configuration.nix and services
|
- `default.nix` - Entry point, imports configuration.nix and services
|
||||||
- `configuration.nix` - Host-specific settings (networking, hardware, users)
|
- `configuration.nix` - Host-specific settings (networking, hardware, users)
|
||||||
- `/system/` - Shared system-level configurations applied to ALL hosts
|
- `/system/` - Shared system-level configurations applied to ALL hosts
|
||||||
- Core modules: nix.nix, sshd.nix, sops.nix, acme.nix, autoupgrade.nix
|
- Core modules: nix.nix, sshd.nix, vault-secrets.nix, acme.nix, autoupgrade.nix
|
||||||
|
- Additional modules: motd.nix (dynamic MOTD), packages.nix (base packages), root-user.nix (root config), homelab-deploy.nix (NATS listener)
|
||||||
- Monitoring: node-exporter and promtail on every host
|
- Monitoring: node-exporter and promtail on every host
|
||||||
|
- `/modules/` - Custom NixOS modules
|
||||||
|
- `homelab/` - Homelab-specific options (see "Homelab Module Options" section below)
|
||||||
|
- `/lib/` - Nix library functions
|
||||||
|
- `dns-zone.nix` - DNS zone generation functions
|
||||||
|
- `monitoring.nix` - Prometheus scrape target generation functions
|
||||||
- `/services/` - Reusable service modules, selectively imported by hosts
|
- `/services/` - Reusable service modules, selectively imported by hosts
|
||||||
- `home-assistant/` - Home automation stack
|
- `home-assistant/` - Home automation stack
|
||||||
- `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
|
- `monitoring/` - Observability stack (Prometheus, Grafana, Loki, Tempo)
|
||||||
- `ns/` - DNS services (authoritative, resolver)
|
- `ns/` - DNS services (authoritative, resolver, zone generation)
|
||||||
- `http-proxy/`, `ca/`, `postgres/`, `nats/`, `jellyfin/`, etc.
|
- `vault/` - OpenBao (Vault) secrets server
|
||||||
- `/secrets/` - SOPS-encrypted secrets with age encryption
|
- `actions-runner/` - GitHub Actions runner
|
||||||
|
- `http-proxy/`, `postgres/`, `nats/`, `jellyfin/`, etc.
|
||||||
- `/common/` - Shared configurations (e.g., VM guest agent)
|
- `/common/` - Shared configurations (e.g., VM guest agent)
|
||||||
- `/playbooks/` - Ansible playbooks for fleet management
|
- `/docs/` - Documentation and plans
|
||||||
- `/.sops.yaml` - SOPS configuration with age keys for all servers
|
- `plans/` - Future plans and proposals
|
||||||
|
- `plans/completed/` - Completed plans (moved here when done)
|
||||||
|
- `/ansible/` - Ansible configuration and playbooks
|
||||||
|
- `ansible.cfg` - Ansible configuration (inventory path, defaults)
|
||||||
|
- `inventory/` - Dynamic and static inventory sources
|
||||||
|
- `playbooks/` - Ansible playbooks for fleet management
|
||||||
|
|
||||||
### Configuration Inheritance
|
### Configuration Inheritance
|
||||||
|
|
||||||
@@ -88,39 +306,28 @@ hosts/<hostname>/default.nix
|
|||||||
All hosts automatically get:
|
All hosts automatically get:
|
||||||
- Nix binary cache (nix-cache.home.2rjus.net)
|
- Nix binary cache (nix-cache.home.2rjus.net)
|
||||||
- SSH with root login enabled
|
- SSH with root login enabled
|
||||||
- SOPS secrets management with auto-generated age keys
|
- OpenBao (Vault) secrets management via AppRole
|
||||||
- Internal ACME CA integration (ca.home.2rjus.net)
|
- Internal ACME CA integration (OpenBao PKI at vault.home.2rjus.net)
|
||||||
- Daily auto-upgrades with auto-reboot
|
- Daily auto-upgrades with auto-reboot
|
||||||
- Prometheus node-exporter + Promtail (logs to monitoring01)
|
- Prometheus node-exporter + Promtail (logs to monitoring02)
|
||||||
|
- Monitoring scrape target auto-registration via `homelab.monitoring` options
|
||||||
- Custom root CA trust
|
- Custom root CA trust
|
||||||
|
- DNS zone auto-registration via `homelab.dns` options
|
||||||
|
|
||||||
### Active Hosts
|
### Hosts
|
||||||
|
|
||||||
Production servers managed by `rebuild-all.sh`:
|
Host configurations are in `/hosts/<hostname>/`. See `flake.nix` for the complete list of `nixosConfigurations`.
|
||||||
- `ns1`, `ns2` - Primary/secondary DNS servers (10.69.13.5/6)
|
|
||||||
- `ca` - Internal Certificate Authority
|
|
||||||
- `ha1` - Home Assistant + Zigbee2MQTT + Mosquitto
|
|
||||||
- `http-proxy` - Reverse proxy
|
|
||||||
- `monitoring01` - Full observability stack (Prometheus, Grafana, Loki, Tempo, Pyroscope)
|
|
||||||
- `jelly01` - Jellyfin media server
|
|
||||||
- `nix-cache01` - Binary cache server
|
|
||||||
- `pgdb1` - PostgreSQL database
|
|
||||||
- `nats1` - NATS messaging server
|
|
||||||
- `auth01` - Authentication service
|
|
||||||
|
|
||||||
Template/test hosts:
|
Use `nix flake show` or `nix develop -c ansible-inventory --graph` to list all hosts.
|
||||||
- `template1` - Base template for cloning new hosts
|
|
||||||
- `nixos-test1` - Test environment
|
|
||||||
|
|
||||||
### Flake Inputs
|
### Flake Inputs
|
||||||
|
|
||||||
- `nixpkgs` - NixOS 25.11 stable (primary)
|
- `nixpkgs` - NixOS 25.11 stable (primary)
|
||||||
- `nixpkgs-unstable` - Unstable channel (available via overlay as `pkgs.unstable.<package>`)
|
- `nixpkgs-unstable` - Unstable channel (available via overlay as `pkgs.unstable.<package>`)
|
||||||
- `sops-nix` - Secrets management
|
- `nixos-exporter` - NixOS module for exposing flake revision metrics (used to verify deployments)
|
||||||
|
- `homelab-deploy` - NATS-based remote deployment tool for test-tier hosts
|
||||||
- Custom packages from git.t-juice.club:
|
- Custom packages from git.t-juice.club:
|
||||||
- `backup-helper` - Backup automation module
|
|
||||||
- `alerttonotify` - Alert routing
|
- `alerttonotify` - Alert routing
|
||||||
- `labmon` - Lab monitoring
|
|
||||||
|
|
||||||
### Network Architecture
|
### Network Architecture
|
||||||
|
|
||||||
@@ -128,17 +335,21 @@ Template/test hosts:
|
|||||||
- Infrastructure subnet: `10.69.13.x`
|
- Infrastructure subnet: `10.69.13.x`
|
||||||
- DNS: ns1/ns2 provide authoritative DNS with primary-secondary setup
|
- DNS: ns1/ns2 provide authoritative DNS with primary-secondary setup
|
||||||
- Internal CA for ACME certificates (no Let's Encrypt)
|
- Internal CA for ACME certificates (no Let's Encrypt)
|
||||||
- Centralized monitoring at monitoring01
|
- Centralized monitoring at monitoring02
|
||||||
- Static networking via systemd-networkd
|
- Static networking via systemd-networkd
|
||||||
|
|
||||||
### Secrets Management
|
### Secrets Management
|
||||||
|
|
||||||
- Uses SOPS with age encryption
|
Most hosts use OpenBao (Vault) for secrets:
|
||||||
- Each server has unique age key in `.sops.yaml`
|
- Vault server at `vault01.home.2rjus.net:8200`
|
||||||
- Keys auto-generated at `/var/lib/sops-nix/key.txt` on first boot
|
- AppRole authentication with credentials at `/var/lib/vault/approle/`
|
||||||
- Shared secrets: `/secrets/secrets.yaml`
|
- Secrets defined in Terraform (`terraform/vault/secrets.tf`)
|
||||||
- Per-host secrets: `/secrets/<hostname>/`
|
- AppRole policies in Terraform (`terraform/vault/approle.tf`)
|
||||||
- All production servers can decrypt shared secrets; host-specific secrets require specific host keys
|
- NixOS module: `system/vault-secrets.nix` with `vault.secrets.<name>` options
|
||||||
|
- `extractKey` option extracts a single key from vault JSON as a plain file
|
||||||
|
- Secrets fetched at boot by `vault-secret-<name>.service` systemd units
|
||||||
|
- Fallback to cached secrets in `/var/lib/vault/cache/` when Vault is unreachable
|
||||||
|
- Provision AppRole credentials: `nix develop -c ansible-playbook ansible/playbooks/provision-approle.yml -l <hostname>`
|
||||||
|
|
||||||
### Auto-Upgrade System
|
### Auto-Upgrade System
|
||||||
|
|
||||||
@@ -162,7 +373,7 @@ Template VMs are built from `hosts/template2` and deployed to Proxmox using Ansi
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Build NixOS image and deploy to Proxmox as template
|
# Build NixOS image and deploy to Proxmox as template
|
||||||
nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml
|
nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
This playbook:
|
This playbook:
|
||||||
@@ -200,22 +411,58 @@ Example VM deployment includes:
|
|||||||
- Custom CPU/memory/disk sizing
|
- Custom CPU/memory/disk sizing
|
||||||
- VLAN tagging
|
- VLAN tagging
|
||||||
- QEMU guest agent
|
- QEMU guest agent
|
||||||
|
- Automatic Vault credential provisioning via `vault_wrapped_token`
|
||||||
|
|
||||||
OpenTofu outputs the VM's IP address after deployment for easy SSH access.
|
OpenTofu outputs the VM's IP address after deployment for easy SSH access.
|
||||||
|
|
||||||
|
**Automatic Vault Credential Provisioning:**
|
||||||
|
|
||||||
|
VMs can receive Vault (OpenBao) credentials automatically during bootstrap:
|
||||||
|
|
||||||
|
1. OpenTofu generates a wrapped token via `terraform/vault/` and stores it in the VM configuration
|
||||||
|
2. Cloud-init passes `VAULT_WRAPPED_TOKEN` and `NIXOS_FLAKE_BRANCH` to the bootstrap script
|
||||||
|
3. The bootstrap script unwraps the token to obtain AppRole credentials
|
||||||
|
4. Credentials are written to `/var/lib/vault/approle/` before the NixOS rebuild
|
||||||
|
|
||||||
|
This eliminates the need for manual `provision-approle.yml` playbook runs on new VMs. Bootstrap progress is logged to Loki with `job="bootstrap"` labels.
|
||||||
|
|
||||||
|
#### Template Rebuilding and Terraform State
|
||||||
|
|
||||||
|
When the Proxmox template is rebuilt (via `build-and-deploy-template.yml`), the template name may change. This would normally cause Terraform to want to recreate all existing VMs, but that's unnecessary since VMs are independent once cloned.
|
||||||
|
|
||||||
|
**Solution**: The `terraform/vms.tf` file includes a lifecycle rule to ignore certain attributes that don't need management:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
lifecycle {
|
||||||
|
ignore_changes = [
|
||||||
|
clone, # Template name can change without recreating VMs
|
||||||
|
startup_shutdown, # Proxmox sets defaults (-1) that we don't need to manage
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This means:
|
||||||
|
- **clone**: Existing VMs are not affected by template name changes; only new VMs use the updated template
|
||||||
|
- **startup_shutdown**: Proxmox sets default startup order/delay values (-1) that Terraform would otherwise try to remove
|
||||||
|
- You can safely update `default_template_name` in `terraform/variables.tf` without recreating VMs
|
||||||
|
- `tofu plan` won't show spurious changes for Proxmox-managed defaults
|
||||||
|
|
||||||
|
**When rebuilding the template:**
|
||||||
|
1. Run `nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml`
|
||||||
|
2. Update `default_template_name` in `terraform/variables.tf` if the name changed
|
||||||
|
3. Run `tofu plan` - should show no VM recreations (only template name in state)
|
||||||
|
4. Run `tofu apply` - updates state without touching existing VMs
|
||||||
|
5. New VMs created after this point will use the new template
|
||||||
|
|
||||||
### Adding a New Host
|
### Adding a New Host
|
||||||
|
|
||||||
1. Create `/hosts/<hostname>/` directory
|
See [docs/host-creation.md](docs/host-creation.md) for the complete host creation pipeline, including:
|
||||||
2. Copy structure from `template1` or similar host
|
- Using the `create-host` script to generate host configurations
|
||||||
3. Add host entry to `flake.nix` nixosConfigurations
|
- Deploying VMs and secrets with OpenTofu
|
||||||
4. Add hostname to dns zone files. Merge to master. Run auto-upgrade on dns servers.
|
- Monitoring the bootstrap process via Loki
|
||||||
5. User clones template host
|
- Verification and troubleshooting steps
|
||||||
6. User runs `prepare-host.sh` on new host, this deletes files which should be regenerated, like ssh host keys, machine-id etc. It also creates a new age key, and prints the public key
|
|
||||||
7. This key is then added to `.sops.yaml`
|
**Note:** DNS A records and Prometheus node-exporter scrape targets are auto-generated from the host's `systemd.network.networks` static IP configuration. No manual zone file or Prometheus config editing is required.
|
||||||
8. Create `/secrets/<hostname>/` if needed
|
|
||||||
9. Configure networking (static IP, DNS servers)
|
|
||||||
10. Commit changes, and merge to master.
|
|
||||||
11. Deploy by running `nixos-rebuild boot --flake URL#<hostname>` on the host.
|
|
||||||
|
|
||||||
### Important Patterns
|
### Important Patterns
|
||||||
|
|
||||||
@@ -229,18 +476,68 @@ OpenTofu outputs the VM's IP address after deployment for easy SSH access.
|
|||||||
|
|
||||||
**Firewall**: Disabled on most hosts (trusted network). Enable selectively in host configuration if needed.
|
**Firewall**: Disabled on most hosts (trusted network). Enable selectively in host configuration if needed.
|
||||||
|
|
||||||
|
**Shell scripts**: Use `pkgs.writeShellApplication` instead of `pkgs.writeShellScript` or `pkgs.writeShellScriptBin` for creating shell scripts. `writeShellApplication` provides automatic shellcheck validation, sets strict bash options (`set -euo pipefail`), and allows declaring `runtimeInputs` for dependencies. When referencing the executable path (e.g., in `ExecStart`), use `lib.getExe myScript` to get the proper `bin/` path.
|
||||||
|
|
||||||
### Monitoring Stack
|
### Monitoring Stack
|
||||||
|
|
||||||
All hosts ship metrics and logs to `monitoring01`:
|
All hosts ship metrics and logs to `monitoring02`:
|
||||||
- **Metrics**: Prometheus scrapes node-exporter from all hosts
|
- **Metrics**: VictoriaMetrics scrapes node-exporter from all hosts
|
||||||
- **Logs**: Promtail ships logs to Loki on monitoring01
|
- **Logs**: Promtail ships logs to Loki on monitoring02
|
||||||
- **Access**: Grafana at monitoring01 for visualization
|
- **Access**: Grafana at monitoring02 for visualization
|
||||||
- **Tracing**: Tempo for distributed tracing
|
|
||||||
- **Profiling**: Pyroscope for continuous profiling
|
**Scrape Target Auto-Generation:**
|
||||||
|
|
||||||
|
VictoriaMetrics scrape targets are automatically generated from host configurations, following the same pattern as DNS zone generation:
|
||||||
|
|
||||||
|
- **Node-exporter**: All flake hosts with static IPs are automatically added as node-exporter targets
|
||||||
|
- **Service targets**: Defined via `homelab.monitoring.scrapeTargets` in service modules
|
||||||
|
- **External targets**: Non-flake hosts defined in `/services/monitoring/external-targets.nix`
|
||||||
|
- **Library**: `lib/monitoring.nix` provides `generateNodeExporterTargets` and `generateScrapeConfigs`
|
||||||
|
|
||||||
|
Service modules declare their scrape targets directly via `homelab.monitoring.scrapeTargets`. The VictoriaMetrics config on monitoring02 auto-generates scrape configs from all hosts. See "Homelab Module Options" section for available options.
|
||||||
|
|
||||||
|
To add monitoring targets for non-NixOS hosts, edit `/services/monitoring/external-targets.nix`.
|
||||||
|
|
||||||
### DNS Architecture
|
### DNS Architecture
|
||||||
|
|
||||||
- `ns1` (10.69.13.5) - Primary authoritative DNS + resolver
|
- `ns1` (10.69.13.5) - Primary authoritative DNS + resolver
|
||||||
- `ns2` (10.69.13.6) - Secondary authoritative DNS (AXFR from ns1)
|
- `ns2` (10.69.13.6) - Secondary authoritative DNS (AXFR from ns1)
|
||||||
- Zone files managed in `/services/ns/`
|
|
||||||
- All hosts point to ns1/ns2 for DNS resolution
|
- All hosts point to ns1/ns2 for DNS resolution
|
||||||
|
|
||||||
|
**Zone Auto-Generation:**
|
||||||
|
|
||||||
|
DNS zone entries are automatically generated from host configurations:
|
||||||
|
|
||||||
|
- **Flake-managed hosts**: A records extracted from `systemd.network.networks` static IPs
|
||||||
|
- **CNAMEs**: Defined via `homelab.dns.cnames` option in host configs
|
||||||
|
- **External hosts**: Non-flake hosts defined in `/services/ns/external-hosts.nix`
|
||||||
|
- **Serial number**: Uses `self.sourceInfo.lastModified` (git commit timestamp)
|
||||||
|
|
||||||
|
Hosts are automatically excluded from DNS if:
|
||||||
|
- `homelab.dns.enable = false` (e.g., template hosts)
|
||||||
|
- No static IP configured (e.g., DHCP-only hosts)
|
||||||
|
- Network interface is a VPN/tunnel (wg*, tun*, tap*)
|
||||||
|
|
||||||
|
To add DNS entries for non-NixOS hosts, edit `/services/ns/external-hosts.nix`.
|
||||||
|
|
||||||
|
### Homelab Module Options
|
||||||
|
|
||||||
|
The `modules/homelab/` directory defines custom options used across hosts for automation and metadata.
|
||||||
|
|
||||||
|
**Host options (`homelab.host.*`):**
|
||||||
|
- `tier` - Deployment tier: `test` or `prod`. Test-tier hosts can receive remote deployments and have different credential access.
|
||||||
|
- `priority` - Alerting priority: `high` or `low`. Controls alerting thresholds for the host.
|
||||||
|
- `role` - Primary role designation (e.g., `dns`, `database`, `bastion`, `vault`)
|
||||||
|
- `labels` - Free-form key-value metadata for host categorization
|
||||||
|
- `ansible = "false"` - Exclude host from Ansible dynamic inventory
|
||||||
|
|
||||||
|
**DNS options (`homelab.dns.*`):**
|
||||||
|
- `enable` (default: `true`) - Include host in DNS zone generation
|
||||||
|
- `cnames` (default: `[]`) - List of CNAME aliases pointing to this host
|
||||||
|
|
||||||
|
**Monitoring options (`homelab.monitoring.*`):**
|
||||||
|
- `enable` (default: `true`) - Include host in Prometheus node-exporter scrape targets
|
||||||
|
- `scrapeTargets` (default: `[]`) - Additional scrape targets exposed by this host
|
||||||
|
|
||||||
|
**Deploy options (`homelab.deploy.*`):**
|
||||||
|
- `enable` (default: `false`) - Enable NATS-based remote deployment listener. When enabled, the host listens for deployment commands via NATS and can be targeted by the `homelab-deploy` MCP server.
|
||||||
|
|||||||
125
README.md
125
README.md
@@ -1,11 +1,124 @@
|
|||||||
# nixos-servers
|
# nixos-servers
|
||||||
|
|
||||||
Nixos configs for my homelab servers.
|
NixOS Flake-based configuration repository for a homelab infrastructure. All hosts run NixOS 25.11 and are managed declaratively through this single repository.
|
||||||
|
|
||||||
## Configurations in use
|
## Hosts
|
||||||
|
|
||||||
* ha1
|
| Host | Role |
|
||||||
* ns1
|
|------|------|
|
||||||
* ns2
|
| `ns1`, `ns2` | Primary/secondary authoritative DNS |
|
||||||
* template1
|
| `ca` | Internal Certificate Authority |
|
||||||
|
| `ha1` | Home Assistant + Zigbee2MQTT + Mosquitto |
|
||||||
|
| `http-proxy` | Reverse proxy |
|
||||||
|
| `monitoring02` | VictoriaMetrics, Grafana, Loki, Alertmanager |
|
||||||
|
| `jelly01` | Jellyfin media server |
|
||||||
|
| `nix-cache02` | Nix binary cache + NATS-based build service |
|
||||||
|
| `nats1` | NATS messaging |
|
||||||
|
| `vault01` | OpenBao (Vault) secrets management |
|
||||||
|
| `template1`, `template2` | VM templates for cloning new hosts |
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
flake.nix # Flake entry point, defines all host configurations
|
||||||
|
hosts/<hostname>/ # Per-host configuration
|
||||||
|
system/ # Shared modules applied to ALL hosts
|
||||||
|
services/ # Reusable service modules, selectively imported per host
|
||||||
|
modules/ # Custom NixOS module definitions
|
||||||
|
lib/ # Nix library functions (DNS zone generation, etc.)
|
||||||
|
secrets/ # SOPS-encrypted secrets (legacy, only used by ca)
|
||||||
|
common/ # Shared configurations (e.g., VM guest agent)
|
||||||
|
terraform/ # OpenTofu configs for Proxmox VM provisioning
|
||||||
|
terraform/vault/ # OpenTofu configs for OpenBao (secrets, PKI, AppRoles)
|
||||||
|
playbooks/ # Ansible playbooks for template building and fleet ops
|
||||||
|
scripts/ # Helper scripts (create-host, vault-fetch)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
**Automatic DNS zone generation** - A records are derived from each host's static IP configuration. CNAME aliases are defined via `homelab.dns.cnames`. No manual zone file editing required.
|
||||||
|
|
||||||
|
**OpenBao (Vault) secrets** - Hosts authenticate via AppRole and fetch secrets at boot. Secrets and policies are managed as code in `terraform/vault/`. Legacy SOPS remains only for the `ca` host.
|
||||||
|
|
||||||
|
**Daily auto-upgrades** - All hosts pull from the master branch and automatically rebuild and reboot on a randomized schedule.
|
||||||
|
|
||||||
|
**Shared base configuration** - Every host automatically gets SSH, monitoring (node-exporter + Promtail), internal ACME certificates, and Nix binary cache access via the `system/` modules.
|
||||||
|
|
||||||
|
**Proxmox VM provisioning** - Build VM templates with Ansible and deploy VMs with OpenTofu from `terraform/`.
|
||||||
|
|
||||||
|
**OpenBao (Vault) secrets** - Centralized secrets management with AppRole authentication, PKI infrastructure, and automated bootstrap. Managed as code in `terraform/vault/`.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Enter dev shell (provides ansible, opentofu, openbao, create-host)
|
||||||
|
nix develop
|
||||||
|
|
||||||
|
# Build a host configuration locally
|
||||||
|
nix build .#nixosConfigurations.<hostname>.config.system.build.toplevel
|
||||||
|
|
||||||
|
# List all configurations
|
||||||
|
nix flake show
|
||||||
|
```
|
||||||
|
|
||||||
|
Deployments are done by merging to master and triggering the auto-upgrade on the target host.
|
||||||
|
|
||||||
|
## Provisioning New Hosts
|
||||||
|
|
||||||
|
The repository includes an automated pipeline for creating and deploying new hosts on Proxmox.
|
||||||
|
|
||||||
|
### 1. Generate host configuration
|
||||||
|
|
||||||
|
The `create-host` tool (available in the dev shell) generates all required files for a new host:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
create-host \
|
||||||
|
--hostname myhost \
|
||||||
|
--ip 10.69.13.50/24 \
|
||||||
|
--cpu 4 \
|
||||||
|
--memory 4096 \
|
||||||
|
--disk 50G
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates:
|
||||||
|
- `hosts/<hostname>/` - NixOS configuration (networking, imports, hardware)
|
||||||
|
- Entry in `flake.nix`
|
||||||
|
- VM definition in `terraform/vms.tf`
|
||||||
|
- Vault AppRole policy and wrapped bootstrap token
|
||||||
|
|
||||||
|
Omit `--ip` for DHCP. Use `--dry-run` to preview changes. Use `--force` to regenerate an existing host's config.
|
||||||
|
|
||||||
|
### 2. Build and deploy the VM template
|
||||||
|
|
||||||
|
The Proxmox VM template is built from `hosts/template2` and deployed with Ansible:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
This only needs to be re-run when the base template changes.
|
||||||
|
|
||||||
|
### 3. Deploy the VM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd terraform && tofu apply
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Automatic bootstrap
|
||||||
|
|
||||||
|
On first boot, the VM automatically:
|
||||||
|
1. Receives its hostname and Vault credentials via cloud-init
|
||||||
|
2. Unwraps the Vault token and stores AppRole credentials
|
||||||
|
3. Runs `nixos-rebuild boot` against the flake on the master branch
|
||||||
|
4. Reboots into the host-specific configuration
|
||||||
|
5. Services fetch their secrets from Vault at startup
|
||||||
|
|
||||||
|
No manual intervention is required after `tofu apply`.
|
||||||
|
|
||||||
|
## Network
|
||||||
|
|
||||||
|
- Domain: `home.2rjus.net`
|
||||||
|
- Infrastructure subnet: `10.69.13.0/24`
|
||||||
|
- DNS: ns1/ns2 authoritative with primary-secondary AXFR
|
||||||
|
- Internal CA for TLS certificates (migrating from step-ca to OpenBao PKI)
|
||||||
|
- Centralized monitoring at monitoring02
|
||||||
|
|||||||
549
TODO.md
549
TODO.md
@@ -1,549 +0,0 @@
|
|||||||
# TODO: Automated Host Deployment Pipeline
|
|
||||||
|
|
||||||
## Vision
|
|
||||||
|
|
||||||
Automate the entire process of creating, configuring, and deploying new NixOS hosts on Proxmox from a single command or script.
|
|
||||||
|
|
||||||
**Desired workflow:**
|
|
||||||
```bash
|
|
||||||
./scripts/create-host.sh --hostname myhost --ip 10.69.13.50
|
|
||||||
# Script creates config, deploys VM, bootstraps NixOS, and you're ready to go
|
|
||||||
```
|
|
||||||
|
|
||||||
**Current manual workflow (from CLAUDE.md):**
|
|
||||||
1. Create `/hosts/<hostname>/` directory structure
|
|
||||||
2. Add host to `flake.nix`
|
|
||||||
3. Add DNS entries
|
|
||||||
4. Clone template VM manually
|
|
||||||
5. Run `prepare-host.sh` on new VM
|
|
||||||
6. Add generated age key to `.sops.yaml`
|
|
||||||
7. Configure networking
|
|
||||||
8. Commit and push
|
|
||||||
9. Run `nixos-rebuild boot --flake URL#<hostname>` on host
|
|
||||||
|
|
||||||
## The Plan
|
|
||||||
|
|
||||||
### Phase 1: Parameterized OpenTofu Deployments ✅ COMPLETED
|
|
||||||
|
|
||||||
**Status:** Fully implemented and tested
|
|
||||||
|
|
||||||
**Implementation:**
|
|
||||||
- Locals-based structure using `for_each` pattern for multiple VM deployments
|
|
||||||
- All VM parameters configurable with smart defaults (CPU, memory, disk, IP, storage, etc.)
|
|
||||||
- Automatic DHCP vs static IP detection based on `ip` field presence
|
|
||||||
- Dynamic outputs showing deployed VM IPs and specifications
|
|
||||||
- Successfully tested deploying multiple VMs simultaneously
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [x] Create module/template structure in terraform for repeatable VM deployments
|
|
||||||
- [x] Parameterize VM configuration (hostname, CPU, memory, disk, IP)
|
|
||||||
- [x] Support both DHCP and static IP configuration via cloud-init
|
|
||||||
- [x] Test deploying multiple VMs from same template
|
|
||||||
|
|
||||||
**Deliverable:** ✅ Can deploy multiple VMs with custom parameters via OpenTofu in a single `tofu apply`
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- `terraform/vms.tf` - VM definitions using locals map
|
|
||||||
- `terraform/outputs.tf` - Dynamic outputs for all VMs
|
|
||||||
- `terraform/variables.tf` - Configurable defaults
|
|
||||||
- `terraform/README.md` - Complete documentation
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Phase 2: Host Configuration Generator ✅ COMPLETED
|
|
||||||
|
|
||||||
**Status:** ✅ Fully implemented and tested
|
|
||||||
**Completed:** 2025-02-01
|
|
||||||
**Enhanced:** 2025-02-01 (added --force flag)
|
|
||||||
|
|
||||||
**Goal:** Automate creation of host configuration files
|
|
||||||
|
|
||||||
**Implementation:**
|
|
||||||
- Python CLI tool packaged as Nix derivation
|
|
||||||
- Available as `create-host` command in devShell
|
|
||||||
- Rich terminal UI with configuration previews
|
|
||||||
- Comprehensive validation (hostname format/uniqueness, IP subnet/uniqueness)
|
|
||||||
- Jinja2 templates for NixOS configurations
|
|
||||||
- Automatic updates to flake.nix and terraform/vms.tf
|
|
||||||
- `--force` flag for regenerating existing configurations (useful for testing)
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [x] Create Python CLI with typer framework
|
|
||||||
- [x] Takes parameters: hostname, IP, CPU cores, memory, disk size
|
|
||||||
- [x] Generates `/hosts/<hostname>/` directory structure
|
|
||||||
- [x] Creates `configuration.nix` with proper hostname and networking
|
|
||||||
- [x] Generates `default.nix` with standard imports
|
|
||||||
- [x] References shared `hardware-configuration.nix` from template
|
|
||||||
- [x] Add host entry to `flake.nix` programmatically
|
|
||||||
- [x] Text-based manipulation (regex insertion)
|
|
||||||
- [x] Inserts new nixosConfiguration entry
|
|
||||||
- [x] Maintains proper formatting
|
|
||||||
- [x] Generate corresponding OpenTofu configuration
|
|
||||||
- [x] Adds VM definition to `terraform/vms.tf`
|
|
||||||
- [x] Uses parameters from CLI input
|
|
||||||
- [x] Supports both static IP and DHCP modes
|
|
||||||
- [x] Package as Nix derivation with templates
|
|
||||||
- [x] Add to flake packages and devShell
|
|
||||||
- [x] Implement dry-run mode
|
|
||||||
- [x] Write comprehensive README
|
|
||||||
|
|
||||||
**Usage:**
|
|
||||||
```bash
|
|
||||||
# In nix develop shell
|
|
||||||
create-host \
|
|
||||||
--hostname test01 \
|
|
||||||
--ip 10.69.13.50/24 \ # optional, omit for DHCP
|
|
||||||
--cpu 4 \ # optional, default 2
|
|
||||||
--memory 4096 \ # optional, default 2048
|
|
||||||
--disk 50G \ # optional, default 20G
|
|
||||||
--dry-run # optional preview mode
|
|
||||||
```
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- `scripts/create-host/` - Complete Python package with Nix derivation
|
|
||||||
- `scripts/create-host/README.md` - Full documentation and examples
|
|
||||||
|
|
||||||
**Deliverable:** ✅ Tool generates all config files for a new host, validated with Nix and Terraform
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Phase 3: Bootstrap Mechanism ✅ COMPLETED
|
|
||||||
|
|
||||||
**Status:** ✅ Fully implemented and tested
|
|
||||||
**Completed:** 2025-02-01
|
|
||||||
**Enhanced:** 2025-02-01 (added branch support for testing)
|
|
||||||
|
|
||||||
**Goal:** Get freshly deployed VM to apply its specific host configuration
|
|
||||||
|
|
||||||
**Implementation:** Systemd oneshot service that runs on first boot after cloud-init
|
|
||||||
|
|
||||||
**Approach taken:** Systemd service (variant of Option A)
|
|
||||||
- Systemd service `nixos-bootstrap.service` runs on first boot
|
|
||||||
- Depends on `cloud-config.service` to ensure hostname is set
|
|
||||||
- Reads hostname from `hostnamectl` (set by cloud-init via Terraform)
|
|
||||||
- Supports custom git branch via `NIXOS_FLAKE_BRANCH` environment variable
|
|
||||||
- Runs `nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git?ref=$BRANCH#${hostname}`
|
|
||||||
- Reboots into new configuration on success
|
|
||||||
- Fails gracefully without reboot on errors (network issues, missing config)
|
|
||||||
- Service self-destructs after successful bootstrap (not in new config)
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [x] Create bootstrap service module in template2
|
|
||||||
- [x] systemd oneshot service with proper dependencies
|
|
||||||
- [x] Reads hostname from hostnamectl (cloud-init sets it)
|
|
||||||
- [x] Checks network connectivity via HTTPS (curl)
|
|
||||||
- [x] Runs nixos-rebuild boot with flake URL
|
|
||||||
- [x] Reboots on success, fails gracefully on error
|
|
||||||
- [x] Configure cloud-init datasource
|
|
||||||
- [x] Use ConfigDrive datasource (Proxmox provider)
|
|
||||||
- [x] Add cloud-init disk to Terraform VMs (disks.ide.ide2.cloudinit)
|
|
||||||
- [x] Hostname passed via cloud-init user-data from Terraform
|
|
||||||
- [x] Test bootstrap service execution on fresh VM
|
|
||||||
- [x] Handle failure cases (flake doesn't exist, network issues)
|
|
||||||
- [x] Clear error messages in journald
|
|
||||||
- [x] No reboot on failure
|
|
||||||
- [x] System remains accessible for debugging
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- `hosts/template2/bootstrap.nix` - Bootstrap service definition
|
|
||||||
- `hosts/template2/configuration.nix` - Cloud-init ConfigDrive datasource
|
|
||||||
- `terraform/vms.tf` - Cloud-init disk configuration
|
|
||||||
|
|
||||||
**Deliverable:** ✅ VMs automatically bootstrap and reboot into host-specific configuration on first boot
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Phase 4: Secrets Management with HashiCorp Vault
|
|
||||||
|
|
||||||
**Challenge:** Current sops-nix approach has chicken-and-egg problem with age keys
|
|
||||||
|
|
||||||
**Current workflow:**
|
|
||||||
1. VM boots, generates age key at `/var/lib/sops-nix/key.txt`
|
|
||||||
2. User runs `prepare-host.sh` which prints public key
|
|
||||||
3. User manually adds public key to `.sops.yaml`
|
|
||||||
4. User commits, pushes
|
|
||||||
5. VM can now decrypt secrets
|
|
||||||
|
|
||||||
**Selected approach:** Migrate to HashiCorp Vault for centralized secrets management
|
|
||||||
|
|
||||||
**Benefits:**
|
|
||||||
- Industry-standard secrets management (Vault experience transferable to work)
|
|
||||||
- Eliminates manual age key distribution step
|
|
||||||
- Secrets-as-code via OpenTofu (infrastructure-as-code aligned)
|
|
||||||
- Centralized PKI management (replaces step-ca, consolidates TLS + SSH CA)
|
|
||||||
- Automatic secret rotation capabilities
|
|
||||||
- Audit logging for all secret access
|
|
||||||
- AppRole authentication enables automated bootstrap
|
|
||||||
|
|
||||||
**Architecture:**
|
|
||||||
```
|
|
||||||
vault.home.2rjus.net
|
|
||||||
├─ KV Secrets Engine (replaces sops-nix)
|
|
||||||
├─ PKI Engine (replaces step-ca for TLS)
|
|
||||||
├─ SSH CA Engine (replaces step-ca SSH CA)
|
|
||||||
└─ AppRole Auth (per-host authentication)
|
|
||||||
↓
|
|
||||||
New hosts authenticate on first boot
|
|
||||||
Fetch secrets via Vault API
|
|
||||||
No manual key distribution needed
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
#### Phase 4a: Vault Server Setup
|
|
||||||
|
|
||||||
**Goal:** Deploy and configure Vault server with auto-unseal
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [ ] Create `hosts/vault01/` configuration
|
|
||||||
- [ ] Basic NixOS configuration (hostname, networking, etc.)
|
|
||||||
- [ ] Vault service configuration
|
|
||||||
- [ ] Firewall rules (8200 for API, 8201 for cluster)
|
|
||||||
- [ ] Add to flake.nix and terraform
|
|
||||||
- [ ] Implement auto-unseal mechanism
|
|
||||||
- [ ] **Preferred:** TPM-based auto-unseal if hardware supports it
|
|
||||||
- [ ] Use tpm2-tools to seal/unseal Vault keys
|
|
||||||
- [ ] Systemd service to unseal on boot
|
|
||||||
- [ ] **Fallback:** Shamir secret sharing with systemd automation
|
|
||||||
- [ ] Generate 3 keys, threshold 2
|
|
||||||
- [ ] Store 2 keys on disk (encrypted), keep 1 offline
|
|
||||||
- [ ] Systemd service auto-unseals using 2 keys
|
|
||||||
- [ ] Initial Vault setup
|
|
||||||
- [ ] Initialize Vault
|
|
||||||
- [ ] Configure storage backend (integrated raft or file)
|
|
||||||
- [ ] Set up root token management
|
|
||||||
- [ ] Enable audit logging
|
|
||||||
- [ ] Deploy to infrastructure
|
|
||||||
- [ ] Add DNS entry for vault.home.2rjus.net
|
|
||||||
- [ ] Deploy VM via terraform
|
|
||||||
- [ ] Bootstrap and verify Vault is running
|
|
||||||
|
|
||||||
**Deliverable:** Running Vault server that auto-unseals on boot
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
#### Phase 4b: Vault-as-Code with OpenTofu
|
|
||||||
|
|
||||||
**Goal:** Manage all Vault configuration (secrets structure, policies, roles) as code
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [ ] Set up Vault Terraform provider
|
|
||||||
- [ ] Create `terraform/vault/` directory
|
|
||||||
- [ ] Configure Vault provider (address, auth)
|
|
||||||
- [ ] Store Vault token securely (terraform.tfvars, gitignored)
|
|
||||||
- [ ] Enable and configure secrets engines
|
|
||||||
- [ ] Enable KV v2 secrets engine at `secret/`
|
|
||||||
- [ ] Define secret path structure (per-service, per-host)
|
|
||||||
- [ ] Example: `secret/monitoring/grafana`, `secret/postgres/ha1`
|
|
||||||
- [ ] Define policies as code
|
|
||||||
- [ ] Create policies for different service tiers
|
|
||||||
- [ ] Principle of least privilege (hosts only read their secrets)
|
|
||||||
- [ ] Example: monitoring-policy allows read on `secret/monitoring/*`
|
|
||||||
- [ ] Set up AppRole authentication
|
|
||||||
- [ ] Enable AppRole auth backend
|
|
||||||
- [ ] Create role per host type (monitoring, dns, database, etc.)
|
|
||||||
- [ ] Bind policies to roles
|
|
||||||
- [ ] Configure TTL and token policies
|
|
||||||
- [ ] Migrate existing secrets from sops-nix
|
|
||||||
- [ ] Create migration script/playbook
|
|
||||||
- [ ] Decrypt sops secrets and load into Vault KV
|
|
||||||
- [ ] Verify all secrets migrated successfully
|
|
||||||
- [ ] Keep sops as backup during transition
|
|
||||||
- [ ] Implement secrets-as-code patterns
|
|
||||||
- [ ] Secret values in gitignored terraform.tfvars
|
|
||||||
- [ ] Or use random_password for auto-generated secrets
|
|
||||||
- [ ] Secret structure/paths in version-controlled .tf files
|
|
||||||
|
|
||||||
**Example OpenTofu:**
|
|
||||||
```hcl
|
|
||||||
resource "vault_kv_secret_v2" "monitoring_grafana" {
|
|
||||||
mount = "secret"
|
|
||||||
name = "monitoring/grafana"
|
|
||||||
data_json = jsonencode({
|
|
||||||
admin_password = var.grafana_admin_password
|
|
||||||
smtp_password = var.smtp_password
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "vault_policy" "monitoring" {
|
|
||||||
name = "monitoring-policy"
|
|
||||||
policy = <<EOT
|
|
||||||
path "secret/data/monitoring/*" {
|
|
||||||
capabilities = ["read"]
|
|
||||||
}
|
|
||||||
EOT
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "vault_approle_auth_backend_role" "monitoring01" {
|
|
||||||
backend = "approle"
|
|
||||||
role_name = "monitoring01"
|
|
||||||
token_policies = ["monitoring-policy"]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Deliverable:** All secrets and policies managed as OpenTofu code in `terraform/vault/`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
#### Phase 4c: PKI Migration (Replace step-ca)
|
|
||||||
|
|
||||||
**Goal:** Consolidate PKI infrastructure into Vault
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [ ] Set up Vault PKI engines
|
|
||||||
- [ ] Create root CA in Vault (`pki/` mount, 10 year TTL)
|
|
||||||
- [ ] Create intermediate CA (`pki_int/` mount, 5 year TTL)
|
|
||||||
- [ ] Sign intermediate with root CA
|
|
||||||
- [ ] Configure CRL and OCSP
|
|
||||||
- [ ] Enable ACME support
|
|
||||||
- [ ] Enable ACME on intermediate CA (Vault 1.14+)
|
|
||||||
- [ ] Create PKI role for homelab domain
|
|
||||||
- [ ] Set certificate TTLs and allowed domains
|
|
||||||
- [ ] Configure SSH CA in Vault
|
|
||||||
- [ ] Enable SSH secrets engine (`ssh/` mount)
|
|
||||||
- [ ] Generate SSH signing keys
|
|
||||||
- [ ] Create roles for host and user certificates
|
|
||||||
- [ ] Configure TTLs and allowed principals
|
|
||||||
- [ ] Migrate hosts from step-ca to Vault
|
|
||||||
- [ ] Update system/acme.nix to use Vault ACME endpoint
|
|
||||||
- [ ] Change server to `https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory`
|
|
||||||
- [ ] Test certificate issuance on one host
|
|
||||||
- [ ] Roll out to all hosts via auto-upgrade
|
|
||||||
- [ ] Migrate SSH CA trust
|
|
||||||
- [ ] Distribute Vault SSH CA public key to all hosts
|
|
||||||
- [ ] Update sshd_config to trust Vault CA
|
|
||||||
- [ ] Test SSH certificate authentication
|
|
||||||
- [ ] Decommission step-ca
|
|
||||||
- [ ] Verify all services migrated
|
|
||||||
- [ ] Stop step-ca service on ca host
|
|
||||||
- [ ] Archive step-ca configuration for backup
|
|
||||||
|
|
||||||
**Deliverable:** All TLS and SSH certificates issued by Vault, step-ca retired
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
#### Phase 4d: Bootstrap Integration
|
|
||||||
|
|
||||||
**Goal:** New hosts automatically authenticate to Vault on first boot, no manual steps
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [ ] Update create-host tool
|
|
||||||
- [ ] Generate AppRole role_id + secret_id for new host
|
|
||||||
- [ ] Or create wrapped token for one-time bootstrap
|
|
||||||
- [ ] Add host-specific policy to Vault (via terraform)
|
|
||||||
- [ ] Store bootstrap credentials for cloud-init injection
|
|
||||||
- [ ] Update template2 for Vault authentication
|
|
||||||
- [ ] Create Vault authentication module
|
|
||||||
- [ ] Reads bootstrap credentials from cloud-init
|
|
||||||
- [ ] Authenticates to Vault, retrieves permanent AppRole credentials
|
|
||||||
- [ ] Stores role_id + secret_id locally for services to use
|
|
||||||
- [ ] Create NixOS Vault secrets module
|
|
||||||
- [ ] Replacement for sops.secrets
|
|
||||||
- [ ] Fetches secrets from Vault at nixos-rebuild/activation time
|
|
||||||
- [ ] Or runtime secret fetching for services
|
|
||||||
- [ ] Handle Vault token renewal
|
|
||||||
- [ ] Update bootstrap service
|
|
||||||
- [ ] After authenticating to Vault, fetch any bootstrap secrets
|
|
||||||
- [ ] Run nixos-rebuild with host configuration
|
|
||||||
- [ ] Services automatically fetch their secrets from Vault
|
|
||||||
- [ ] Update terraform cloud-init
|
|
||||||
- [ ] Inject Vault address and bootstrap credentials
|
|
||||||
- [ ] Pass via cloud-init user-data or write_files
|
|
||||||
- [ ] Credentials scoped to single use or short TTL
|
|
||||||
- [ ] Test complete flow
|
|
||||||
- [ ] Run create-host to generate new host config
|
|
||||||
- [ ] Deploy with terraform
|
|
||||||
- [ ] Verify host bootstraps and authenticates to Vault
|
|
||||||
- [ ] Verify services can fetch secrets
|
|
||||||
- [ ] Confirm no manual steps required
|
|
||||||
|
|
||||||
**Bootstrap flow:**
|
|
||||||
```
|
|
||||||
1. terraform apply (deploys VM with cloud-init)
|
|
||||||
2. Cloud-init sets hostname + Vault bootstrap credentials
|
|
||||||
3. nixos-bootstrap.service runs:
|
|
||||||
- Authenticates to Vault with bootstrap credentials
|
|
||||||
- Retrieves permanent AppRole credentials
|
|
||||||
- Stores locally for service use
|
|
||||||
- Runs nixos-rebuild
|
|
||||||
4. Host services fetch secrets from Vault as needed
|
|
||||||
5. Done - no manual intervention
|
|
||||||
```
|
|
||||||
|
|
||||||
**Deliverable:** Fully automated secrets access from first boot, zero manual steps
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Phase 5: DNS Automation
|
|
||||||
|
|
||||||
**Goal:** Automatically generate DNS entries from host configurations
|
|
||||||
|
|
||||||
**Approach:** Leverage Nix to generate zone file entries from flake host configurations
|
|
||||||
|
|
||||||
Since most hosts use static IPs defined in their NixOS configurations, we can extract this information and automatically generate A records. This keeps DNS in sync with the actual host configs.
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [ ] Add optional CNAME field to host configurations
|
|
||||||
- [ ] Add `networking.cnames = [ "alias1" "alias2" ]` or similar option
|
|
||||||
- [ ] Document in host configuration template
|
|
||||||
- [ ] Create Nix function to extract DNS records from all hosts
|
|
||||||
- [ ] Parse each host's `networking.hostName` and IP configuration
|
|
||||||
- [ ] Collect any defined CNAMEs
|
|
||||||
- [ ] Generate zone file fragment with A and CNAME records
|
|
||||||
- [ ] Integrate auto-generated records into zone files
|
|
||||||
- [ ] Keep manual entries separate (for non-flake hosts/services)
|
|
||||||
- [ ] Include generated fragment in main zone file
|
|
||||||
- [ ] Add comments showing which records are auto-generated
|
|
||||||
- [ ] Update zone file serial number automatically
|
|
||||||
- [ ] Test zone file validity after generation
|
|
||||||
- [ ] Either:
|
|
||||||
- [ ] Automatically trigger DNS server reload (Ansible)
|
|
||||||
- [ ] Or document manual step: merge to master, run upgrade on ns1/ns2
|
|
||||||
|
|
||||||
**Deliverable:** DNS A records and CNAMEs automatically generated from host configs
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Phase 6: Integration Script
|
|
||||||
|
|
||||||
**Goal:** Single command to create and deploy a new host
|
|
||||||
|
|
||||||
**Tasks:**
|
|
||||||
- [ ] Create `scripts/create-host.sh` master script that orchestrates:
|
|
||||||
1. Prompts for: hostname, IP (or DHCP), CPU, memory, disk
|
|
||||||
2. Validates inputs (IP not in use, hostname unique, etc.)
|
|
||||||
3. Calls host config generator (Phase 2)
|
|
||||||
4. Generates OpenTofu config (Phase 2)
|
|
||||||
5. Handles secrets (Phase 4)
|
|
||||||
6. Updates DNS (Phase 5)
|
|
||||||
7. Commits all changes to git
|
|
||||||
8. Runs `tofu apply` to deploy VM
|
|
||||||
9. Waits for bootstrap to complete (Phase 3)
|
|
||||||
10. Prints success message with IP and SSH command
|
|
||||||
- [ ] Add `--dry-run` flag to preview changes
|
|
||||||
- [ ] Add `--interactive` mode vs `--batch` mode
|
|
||||||
- [ ] Error handling and rollback on failures
|
|
||||||
|
|
||||||
**Deliverable:** `./scripts/create-host.sh --hostname myhost --ip 10.69.13.50` creates a fully working host
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Phase 7: Testing & Documentation
|
|
||||||
|
|
||||||
**Status:** 🚧 In Progress (testing improvements completed)
|
|
||||||
|
|
||||||
**Testing Improvements Implemented (2025-02-01):**
|
|
||||||
|
|
||||||
The pipeline now supports efficient testing without polluting master branch:
|
|
||||||
|
|
||||||
**1. --force Flag for create-host**
|
|
||||||
- Re-run `create-host` to regenerate existing configurations
|
|
||||||
- Updates existing entries in flake.nix and terraform/vms.tf (no duplicates)
|
|
||||||
- Skips uniqueness validation checks
|
|
||||||
- Useful for iterating on configuration templates during testing
|
|
||||||
|
|
||||||
**2. Branch Support for Bootstrap**
|
|
||||||
- Bootstrap service reads `NIXOS_FLAKE_BRANCH` environment variable
|
|
||||||
- Defaults to `master` if not set
|
|
||||||
- Allows testing pipeline changes on feature branches
|
|
||||||
- Cloud-init passes branch via `/etc/environment`
|
|
||||||
|
|
||||||
**3. Cloud-init Disk for Branch Configuration**
|
|
||||||
- Terraform generates custom cloud-init snippets for test VMs
|
|
||||||
- Set `flake_branch` field in VM definition to use non-master branch
|
|
||||||
- Production VMs omit this field and use master (default)
|
|
||||||
- Files automatically uploaded to Proxmox via SSH
|
|
||||||
|
|
||||||
**Testing Workflow:**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# 1. Create test branch
|
|
||||||
git checkout -b test-pipeline
|
|
||||||
|
|
||||||
# 2. Generate or update host config
|
|
||||||
create-host --hostname testvm01 --ip 10.69.13.100/24
|
|
||||||
|
|
||||||
# 3. Edit terraform/vms.tf to add test VM with branch
|
|
||||||
# vms = {
|
|
||||||
# "testvm01" = {
|
|
||||||
# ip = "10.69.13.100/24"
|
|
||||||
# flake_branch = "test-pipeline" # Bootstrap from this branch
|
|
||||||
# }
|
|
||||||
# }
|
|
||||||
|
|
||||||
# 4. Commit and push test branch
|
|
||||||
git add -A && git commit -m "test: add testvm01"
|
|
||||||
git push origin test-pipeline
|
|
||||||
|
|
||||||
# 5. Deploy VM
|
|
||||||
cd terraform && tofu apply
|
|
||||||
|
|
||||||
# 6. Watch bootstrap (VM fetches from test-pipeline branch)
|
|
||||||
ssh root@10.69.13.100
|
|
||||||
journalctl -fu nixos-bootstrap.service
|
|
||||||
|
|
||||||
# 7. Iterate: modify templates and regenerate with --force
|
|
||||||
cd .. && create-host --hostname testvm01 --ip 10.69.13.100/24 --force
|
|
||||||
git commit -am "test: update config" && git push
|
|
||||||
|
|
||||||
# Redeploy to test fresh bootstrap
|
|
||||||
cd terraform
|
|
||||||
tofu destroy -target=proxmox_vm_qemu.vm[\"testvm01\"] && tofu apply
|
|
||||||
|
|
||||||
# 8. Clean up when done: squash commits, merge to master, remove test VM
|
|
||||||
```
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- `scripts/create-host/create_host.py` - Added --force parameter
|
|
||||||
- `scripts/create-host/manipulators.py` - Update vs insert logic
|
|
||||||
- `hosts/template2/bootstrap.nix` - Branch support via environment variable
|
|
||||||
- `terraform/vms.tf` - flake_branch field support
|
|
||||||
- `terraform/cloud-init.tf` - Custom cloud-init disk generation
|
|
||||||
- `terraform/variables.tf` - proxmox_host variable for SSH uploads
|
|
||||||
|
|
||||||
**Remaining Tasks:**
|
|
||||||
- [ ] Test full pipeline end-to-end on feature branch
|
|
||||||
- [ ] Update CLAUDE.md with testing workflow
|
|
||||||
- [ ] Add troubleshooting section
|
|
||||||
- [ ] Create examples for common scenarios (DHCP host, static IP host, etc.)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Open Questions
|
|
||||||
|
|
||||||
1. **Bootstrap method:** Cloud-init runcmd vs Terraform provisioner vs Ansible?
|
|
||||||
2. **Secrets handling:** Pre-generate keys vs post-deployment injection?
|
|
||||||
3. **DNS automation:** Auto-commit or manual merge?
|
|
||||||
4. **Git workflow:** Auto-push changes or leave for user review?
|
|
||||||
5. **Template selection:** Single template2 or multiple templates for different host types?
|
|
||||||
6. **Networking:** Always DHCP initially, or support static IP from start?
|
|
||||||
7. **Error recovery:** What happens if bootstrap fails? Manual intervention or retry?
|
|
||||||
|
|
||||||
## Implementation Order
|
|
||||||
|
|
||||||
Recommended sequence:
|
|
||||||
1. Phase 1: Parameterize OpenTofu (foundation for testing)
|
|
||||||
2. Phase 3: Bootstrap mechanism (core automation)
|
|
||||||
3. Phase 2: Config generator (automate the boilerplate)
|
|
||||||
4. Phase 4: Secrets (solves biggest chicken-and-egg)
|
|
||||||
5. Phase 5: DNS (nice-to-have automation)
|
|
||||||
6. Phase 6: Integration script (ties it all together)
|
|
||||||
7. Phase 7: Testing & docs
|
|
||||||
|
|
||||||
## Success Criteria
|
|
||||||
|
|
||||||
When complete, creating a new host should:
|
|
||||||
- Take < 5 minutes of human time
|
|
||||||
- Require minimal user input (hostname, IP, basic specs)
|
|
||||||
- Result in a fully configured, secret-enabled, DNS-registered host
|
|
||||||
- Be reproducible and documented
|
|
||||||
- Handle common errors gracefully
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
- Keep incremental commits at each phase
|
|
||||||
- Test each phase independently before moving to next
|
|
||||||
- Maintain backward compatibility with manual workflow
|
|
||||||
- Document any manual steps that can't be automated
|
|
||||||
120
ansible/README.md
Normal file
120
ansible/README.md
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
# Ansible Configuration
|
||||||
|
|
||||||
|
This directory contains Ansible configuration for fleet management tasks.
|
||||||
|
|
||||||
|
## Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
ansible/
|
||||||
|
├── ansible.cfg # Ansible configuration
|
||||||
|
├── inventory/
|
||||||
|
│ ├── dynamic_flake.py # Dynamic inventory from NixOS flake
|
||||||
|
│ ├── static.yml # Non-flake hosts (Proxmox, etc.)
|
||||||
|
│ └── group_vars/
|
||||||
|
│ └── all.yml # Common variables
|
||||||
|
└── playbooks/
|
||||||
|
├── build-and-deploy-template.yml
|
||||||
|
├── provision-approle.yml
|
||||||
|
├── restart-service.yml
|
||||||
|
└── run-upgrade.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
The devshell automatically configures `ANSIBLE_CONFIG`, so commands work without extra flags:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List inventory groups
|
||||||
|
nix develop -c ansible-inventory --graph
|
||||||
|
|
||||||
|
# List hosts in a specific group
|
||||||
|
nix develop -c ansible-inventory --list | jq '.role_dns'
|
||||||
|
|
||||||
|
# Run a playbook
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/run-upgrade.yml -l tier_test
|
||||||
|
```
|
||||||
|
|
||||||
|
## Inventory
|
||||||
|
|
||||||
|
The inventory combines dynamic and static sources automatically.
|
||||||
|
|
||||||
|
### Dynamic Inventory (from flake)
|
||||||
|
|
||||||
|
The `dynamic_flake.py` script extracts hosts from the NixOS flake using `homelab.host.*` options:
|
||||||
|
|
||||||
|
**Groups generated:**
|
||||||
|
- `flake_hosts` - All NixOS hosts from the flake
|
||||||
|
- `tier_test`, `tier_prod` - By `homelab.host.tier`
|
||||||
|
- `role_dns`, `role_vault`, `role_monitoring`, etc. - By `homelab.host.role`
|
||||||
|
|
||||||
|
**Host variables set:**
|
||||||
|
- `tier` - Deployment tier (test/prod)
|
||||||
|
- `role` - Host role
|
||||||
|
- `short_hostname` - Hostname without domain
|
||||||
|
|
||||||
|
### Static Inventory
|
||||||
|
|
||||||
|
Non-flake hosts are defined in `inventory/static.yml`:
|
||||||
|
|
||||||
|
- `proxmox` - Proxmox hypervisors
|
||||||
|
|
||||||
|
## Playbooks
|
||||||
|
|
||||||
|
| Playbook | Description | Example |
|
||||||
|
|----------|-------------|---------|
|
||||||
|
| `run-upgrade.yml` | Trigger nixos-upgrade on hosts | `-l tier_prod` |
|
||||||
|
| `restart-service.yml` | Restart a systemd service | `-l role_dns -e service=unbound` |
|
||||||
|
| `reboot.yml` | Rolling reboot (one host at a time) | `-l tier_test` |
|
||||||
|
| `provision-approle.yml` | Deploy Vault credentials (single host only) | `-l testvm01` |
|
||||||
|
| `build-and-deploy-template.yml` | Build and deploy Proxmox template | (no limit needed) |
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restart unbound on all DNS servers
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/restart-service.yml \
|
||||||
|
-l role_dns -e service=unbound
|
||||||
|
|
||||||
|
# Trigger upgrade on all test hosts
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/run-upgrade.yml -l tier_test
|
||||||
|
|
||||||
|
# Provision Vault credentials for a specific host
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/provision-approle.yml -l testvm01
|
||||||
|
|
||||||
|
# Build and deploy Proxmox template
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/build-and-deploy-template.yml
|
||||||
|
|
||||||
|
# Rolling reboot of test hosts (one at a time, waits for each to come back)
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/reboot.yml -l tier_test
|
||||||
|
```
|
||||||
|
|
||||||
|
## Excluding Flake Hosts
|
||||||
|
|
||||||
|
To exclude a flake host from the dynamic inventory, add the `ansible = "false"` label in the host's configuration:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host.labels.ansible = "false";
|
||||||
|
```
|
||||||
|
|
||||||
|
Hosts with `homelab.dns.enable = false` are also excluded automatically.
|
||||||
|
|
||||||
|
## Adding Non-Flake Hosts
|
||||||
|
|
||||||
|
Edit `inventory/static.yml` to add hosts not managed by the NixOS flake:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
all:
|
||||||
|
children:
|
||||||
|
my_group:
|
||||||
|
hosts:
|
||||||
|
host1.example.com:
|
||||||
|
ansible_user: admin
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Variables
|
||||||
|
|
||||||
|
Variables in `inventory/group_vars/all.yml` apply to all hosts:
|
||||||
|
|
||||||
|
- `ansible_user` - Default SSH user (root)
|
||||||
|
- `domain` - Domain name (home.2rjus.net)
|
||||||
|
- `vault_addr` - Vault server URL
|
||||||
17
ansible/ansible.cfg
Normal file
17
ansible/ansible.cfg
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
[defaults]
|
||||||
|
inventory = inventory/
|
||||||
|
remote_user = root
|
||||||
|
host_key_checking = False
|
||||||
|
|
||||||
|
# Reduce SSH connection overhead
|
||||||
|
forks = 10
|
||||||
|
pipelining = True
|
||||||
|
|
||||||
|
# Output formatting (YAML output via builtin default callback)
|
||||||
|
stdout_callback = default
|
||||||
|
callbacks_enabled = profile_tasks
|
||||||
|
result_format = yaml
|
||||||
|
|
||||||
|
[ssh_connection]
|
||||||
|
# Reuse SSH connections
|
||||||
|
ssh_args = -o ControlMaster=auto -o ControlPersist=60s
|
||||||
162
ansible/inventory/dynamic_flake.py
Executable file
162
ansible/inventory/dynamic_flake.py
Executable file
@@ -0,0 +1,162 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Dynamic Ansible inventory script that extracts host information from the NixOS flake.
|
||||||
|
|
||||||
|
Generates groups:
|
||||||
|
- flake_hosts: All hosts defined in the flake
|
||||||
|
- tier_test, tier_prod: Hosts by deployment tier
|
||||||
|
- role_<name>: Hosts by role (dns, vault, monitoring, etc.)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
./dynamic_flake.py --list # Return full inventory
|
||||||
|
./dynamic_flake.py --host X # Return host vars (not used, but required by Ansible)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_flake_dir() -> Path:
    """Return the repository root that contains the flake.

    This script lives at ansible/inventory/dynamic_flake.py, so the
    flake root is two directories above this file's directory.
    """
    # resolve() first so symlinked checkouts still yield the real root
    return Path(__file__).resolve().parents[2]
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_flake() -> dict:
    """Evaluate the flake and return per-host metadata.

    Runs ``nix eval --json`` over ``nixosConfigurations`` and applies a
    small Nix expression that extracts hostname, domain, tier, role,
    labels and DNS enablement for every host. Exits the process with
    status 1 on evaluation or JSON-parse failure.
    """
    flake_dir = get_flake_dir()

    # Nix expression to extract relevant config from each host
    nix_expr = """
    configs: builtins.mapAttrs (name: cfg: {
      hostname = cfg.config.networking.hostName;
      domain = cfg.config.networking.domain or "home.2rjus.net";
      tier = cfg.config.homelab.host.tier;
      role = cfg.config.homelab.host.role;
      labels = cfg.config.homelab.host.labels;
      dns_enabled = cfg.config.homelab.dns.enable;
    }) configs
    """

    cmd = [
        "nix",
        "eval",
        "--json",
        f"{flake_dir}#nixosConfigurations",
        "--apply",
        nix_expr,
    ]

    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True,
            cwd=flake_dir,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error evaluating flake: {e.stderr}", file=sys.stderr)
        sys.exit(1)

    try:
        return json.loads(proc.stdout)
    except json.JSONDecodeError as e:
        print(f"Error parsing nix output: {e}", file=sys.stderr)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_group_name(name: str) -> str:
    """Sanitize a string for use as an Ansible group name.

    Ansible group names should contain only alphanumeric characters and
    underscores. The previous implementation only translated hyphens,
    letting other invalid characters (".", ":", " ", ...) leak through;
    now every non-alphanumeric character is mapped to "_", which is
    backward-compatible for names that only contained hyphens.
    """
    # "_".isalnum() is False, so existing underscores map to themselves.
    return "".join(ch if ch.isalnum() else "_" for ch in name)
|
||||||
|
|
||||||
|
|
||||||
|
def build_inventory(hosts_data: dict) -> dict:
    """Build the Ansible inventory structure from flake host metadata.

    Hosts with DNS disabled (e.g. templates) and hosts carrying the
    ``ansible = "false"`` label are skipped. Short hostnames are used as
    inventory names, with the FQDN exposed via ``ansible_host`` so
    connections use the fully-qualified name.
    """
    hostvars: dict[str, dict] = {}
    flake_hosts: list[str] = []
    # Kept separate so tier groups are emitted before role groups,
    # matching the established inventory layout.
    tier_groups: dict[str, list[str]] = {}
    role_groups: dict[str, list[str]] = {}

    for _config_name, info in hosts_data.items():
        # Skip hosts that have DNS disabled (like templates)
        if not info["dns_enabled"]:
            continue
        # Skip hosts with ansible = "false" label
        if info["labels"].get("ansible") == "false":
            continue

        short = info["hostname"]
        fqdn = f"{short}.{info['domain']}"

        flake_hosts.append(short)
        hostvars[short] = {
            "ansible_host": fqdn,  # Connect using FQDN
            "fqdn": fqdn,
            "tier": info["tier"],
            "role": info["role"],
        }

        tier_key = f"tier_{sanitize_group_name(info['tier'])}"
        tier_groups.setdefault(tier_key, []).append(short)

        if info["role"]:
            role_key = f"role_{sanitize_group_name(info['role'])}"
            role_groups.setdefault(role_key, []).append(short)

    inventory = {
        "_meta": {"hostvars": hostvars},
        "flake_hosts": {"hosts": flake_hosts},
    }
    for group_name, members in tier_groups.items():
        inventory[group_name] = {"hosts": members}
    for group_name, members in role_groups.items():
        inventory[group_name] = {"hosts": members}
    return inventory
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point implementing the Ansible dynamic-inventory protocol."""
    args = sys.argv[1:]
    if not args:
        print("Usage: dynamic_flake.py --list | --host <hostname>", file=sys.stderr)
        sys.exit(1)

    mode = args[0]
    if mode == "--list":
        inventory = build_inventory(evaluate_flake())
        print(json.dumps(inventory, indent=2))
    elif mode == "--host":
        # All host vars are delivered via _meta.hostvars in --list, so
        # per-host lookups legitimately return an empty object.
        print(json.dumps({}))
    else:
        print(f"Unknown option: {mode}", file=sys.stderr)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    main()
|
||||||
5
ansible/inventory/group_vars/all.yml
Normal file
5
ansible/inventory/group_vars/all.yml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Common variables for all hosts
|
||||||
|
|
||||||
|
ansible_user: root
|
||||||
|
domain: home.2rjus.net
|
||||||
|
vault_addr: https://vault01.home.2rjus.net:8200
|
||||||
13
ansible/inventory/static.yml
Normal file
13
ansible/inventory/static.yml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Static inventory for non-flake hosts
|
||||||
|
#
|
||||||
|
# Hosts defined here are merged with the dynamic flake inventory.
|
||||||
|
# Use this for infrastructure that isn't managed by NixOS.
|
||||||
|
#
|
||||||
|
# Use short hostnames as inventory names with ansible_host for FQDN.
|
||||||
|
|
||||||
|
all:
|
||||||
|
children:
|
||||||
|
proxmox:
|
||||||
|
hosts:
|
||||||
|
pve1:
|
||||||
|
ansible_host: pve1.home.2rjus.net
|
||||||
@@ -15,13 +15,13 @@
|
|||||||
- name: Build NixOS image
|
- name: Build NixOS image
|
||||||
ansible.builtin.command:
|
ansible.builtin.command:
|
||||||
cmd: "nixos-rebuild build-image --image-variant proxmox --flake .#template2"
|
cmd: "nixos-rebuild build-image --image-variant proxmox --flake .#template2"
|
||||||
chdir: "{{ playbook_dir }}/.."
|
chdir: "{{ playbook_dir }}/../.."
|
||||||
register: build_result
|
register: build_result
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
- name: Find built image file
|
- name: Find built image file
|
||||||
ansible.builtin.find:
|
ansible.builtin.find:
|
||||||
paths: "{{ playbook_dir}}/../result"
|
paths: "{{ playbook_dir}}/../../result"
|
||||||
patterns: "*.vma.zst"
|
patterns: "*.vma.zst"
|
||||||
recurse: true
|
recurse: true
|
||||||
register: image_files
|
register: image_files
|
||||||
@@ -99,3 +99,48 @@
|
|||||||
- name: Display success message
|
- name: Display success message
|
||||||
ansible.builtin.debug:
|
ansible.builtin.debug:
|
||||||
msg: "Template VM {{ template_vmid }} created successfully on {{ storage }}"
|
msg: "Template VM {{ template_vmid }} created successfully on {{ storage }}"
|
||||||
|
|
||||||
|
- name: Update Terraform template name
|
||||||
|
hosts: localhost
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
terraform_dir: "{{ playbook_dir }}/../../terraform"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Get image filename from earlier play
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
image_filename: "{{ hostvars['localhost']['image_filename'] }}"
|
||||||
|
|
||||||
|
- name: Extract template name from image filename
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
new_template_name: "{{ image_filename | regex_replace('\\.vma\\.zst$', '') | regex_replace('^vzdump-qemu-', '') }}"
|
||||||
|
|
||||||
|
- name: Read current Terraform variables file
|
||||||
|
ansible.builtin.slurp:
|
||||||
|
src: "{{ terraform_dir }}/variables.tf"
|
||||||
|
register: variables_tf_content
|
||||||
|
|
||||||
|
- name: Extract current template name from variables.tf
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
current_template_name: "{{ (variables_tf_content.content | b64decode) | regex_search('variable \"default_template_name\"[^}]+default\\s*=\\s*\"([^\"]+)\"', '\\1') | first }}"
|
||||||
|
|
||||||
|
- name: Check if template name has changed
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
template_name_changed: "{{ current_template_name != new_template_name }}"
|
||||||
|
|
||||||
|
- name: Display template name status
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Template name: {{ current_template_name }} -> {{ new_template_name }} ({{ 'changed' if template_name_changed else 'unchanged' }})"
|
||||||
|
|
||||||
|
- name: Update default_template_name in variables.tf
|
||||||
|
ansible.builtin.replace:
|
||||||
|
path: "{{ terraform_dir }}/variables.tf"
|
||||||
|
regexp: '(variable "default_template_name"[^}]+default\s*=\s*)"[^"]+"'
|
||||||
|
replace: '\1"{{ new_template_name }}"'
|
||||||
|
when: template_name_changed
|
||||||
|
|
||||||
|
- name: Display update result
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Updated terraform/variables.tf with new template name: {{ new_template_name }}"
|
||||||
|
when: template_name_changed
|
||||||
84
ansible/playbooks/provision-approle.yml
Normal file
84
ansible/playbooks/provision-approle.yml
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
---
|
||||||
|
# Provision OpenBao AppRole credentials to a host
|
||||||
|
#
|
||||||
|
# Usage: ansible-playbook ansible/playbooks/provision-approle.yml -l <hostname>
|
||||||
|
# Requires: BAO_ADDR and BAO_TOKEN environment variables set
|
||||||
|
#
|
||||||
|
# IMPORTANT: This playbook must target exactly one host to prevent
|
||||||
|
# accidentally regenerating credentials for multiple hosts.
|
||||||
|
|
||||||
|
- name: Validate single host target
|
||||||
|
hosts: all
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Fail if targeting multiple hosts
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: |
|
||||||
|
This playbook must target exactly one host.
|
||||||
|
Use: ansible-playbook provision-approle.yml -l <hostname>
|
||||||
|
|
||||||
|
Targeting multiple hosts would regenerate credentials for all of them,
|
||||||
|
potentially breaking existing services.
|
||||||
|
when: ansible_play_hosts | length != 1
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
- name: Provision AppRole credentials
|
||||||
|
hosts: all
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
target_hostname: "{{ inventory_hostname.split('.')[0] }}"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Display target host
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Provisioning AppRole credentials for: {{ target_hostname }}"
|
||||||
|
|
||||||
|
- name: Get role-id for host
|
||||||
|
ansible.builtin.command:
|
||||||
|
cmd: "bao read -field=role_id auth/approle/role/{{ target_hostname }}/role-id"
|
||||||
|
environment:
|
||||||
|
BAO_ADDR: "{{ vault_addr }}"
|
||||||
|
BAO_SKIP_VERIFY: "1"
|
||||||
|
register: role_id_result
|
||||||
|
changed_when: false
|
||||||
|
delegate_to: localhost
|
||||||
|
|
||||||
|
- name: Generate secret-id for host
|
||||||
|
ansible.builtin.command:
|
||||||
|
cmd: "bao write -field=secret_id -f auth/approle/role/{{ target_hostname }}/secret-id"
|
||||||
|
environment:
|
||||||
|
BAO_ADDR: "{{ vault_addr }}"
|
||||||
|
BAO_SKIP_VERIFY: "1"
|
||||||
|
register: secret_id_result
|
||||||
|
changed_when: true
|
||||||
|
delegate_to: localhost
|
||||||
|
|
||||||
|
- name: Create AppRole directory
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /var/lib/vault/approle
|
||||||
|
state: directory
|
||||||
|
mode: "0700"
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
|
||||||
|
- name: Write role-id
|
||||||
|
ansible.builtin.copy:
|
||||||
|
content: "{{ role_id_result.stdout }}"
|
||||||
|
dest: /var/lib/vault/approle/role-id
|
||||||
|
mode: "0600"
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
|
||||||
|
- name: Write secret-id
|
||||||
|
ansible.builtin.copy:
|
||||||
|
content: "{{ secret_id_result.stdout }}"
|
||||||
|
dest: /var/lib/vault/approle/secret-id
|
||||||
|
mode: "0600"
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
|
||||||
|
- name: Display success
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "AppRole credentials provisioned to {{ inventory_hostname }}"
|
||||||
48
ansible/playbooks/reboot.yml
Normal file
48
ansible/playbooks/reboot.yml
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
---
|
||||||
|
# Reboot hosts with rolling strategy to avoid taking down redundant services
|
||||||
|
#
|
||||||
|
# Usage examples:
|
||||||
|
# # Reboot a single host
|
||||||
|
# ansible-playbook reboot.yml -l testvm01
|
||||||
|
#
|
||||||
|
# # Reboot all test hosts (one at a time)
|
||||||
|
# ansible-playbook reboot.yml -l tier_test
|
||||||
|
#
|
||||||
|
# # Reboot all DNS servers safely (one at a time)
|
||||||
|
# ansible-playbook reboot.yml -l role_dns
|
||||||
|
#
|
||||||
|
# Safety features:
|
||||||
|
# - serial: 1 ensures only one host reboots at a time
|
||||||
|
# - Waits for host to come back online before proceeding
|
||||||
|
# - Groups hosts by role to avoid rebooting same-role hosts consecutively
|
||||||
|
|
||||||
|
- name: Reboot hosts (rolling)
|
||||||
|
hosts: all
|
||||||
|
serial: 1
|
||||||
|
order: shuffle # Randomize to spread out same-role hosts
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
reboot_timeout: 300 # 5 minutes to wait for host to come back
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Display reboot target
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Rebooting {{ inventory_hostname }} (role: {{ role | default('none') }})"
|
||||||
|
|
||||||
|
- name: Reboot the host
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: reboot.target
|
||||||
|
state: started
|
||||||
|
async: 1
|
||||||
|
poll: 0
|
||||||
|
ignore_errors: true
|
||||||
|
|
||||||
|
- name: Wait for host to come back online
|
||||||
|
ansible.builtin.wait_for_connection:
|
||||||
|
delay: 5
|
||||||
|
timeout: "{{ reboot_timeout }}"
|
||||||
|
|
||||||
|
- name: Display reboot result
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "{{ inventory_hostname }} rebooted successfully"
|
||||||
40
ansible/playbooks/restart-service.yml
Normal file
40
ansible/playbooks/restart-service.yml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
---
|
||||||
|
# Restart a systemd service on target hosts
|
||||||
|
#
|
||||||
|
# Usage examples:
|
||||||
|
# # Restart unbound on all DNS servers
|
||||||
|
# ansible-playbook restart-service.yml -l role_dns -e service=unbound
|
||||||
|
#
|
||||||
|
# # Restart nginx on a specific host
|
||||||
|
# ansible-playbook restart-service.yml -l http-proxy.home.2rjus.net -e service=nginx
|
||||||
|
#
|
||||||
|
# # Restart promtail on all prod hosts
|
||||||
|
# ansible-playbook restart-service.yml -l tier_prod -e service=promtail
|
||||||
|
|
||||||
|
- name: Restart systemd service
|
||||||
|
hosts: all
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: Validate service name provided
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: |
|
||||||
|
The 'service' variable is required.
|
||||||
|
Usage: ansible-playbook restart-service.yml -l <target> -e service=<name>
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
-e service=nginx
|
||||||
|
-e service=unbound
|
||||||
|
-e service=promtail
|
||||||
|
when: service is not defined
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
- name: Restart {{ service }}
|
||||||
|
ansible.builtin.systemd:
|
||||||
|
name: "{{ service }}"
|
||||||
|
state: restarted
|
||||||
|
register: restart_result
|
||||||
|
|
||||||
|
- name: Display result
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "Service {{ service }} restarted on {{ inventory_hostname }}"
|
||||||
21
common/ssh-audit.nix
Normal file
21
common/ssh-audit.nix
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# SSH session command auditing
|
||||||
|
#
|
||||||
|
# Logs all commands executed by users who logged in interactively (SSH).
|
||||||
|
# System services and nix builds are excluded via auid filter.
|
||||||
|
#
|
||||||
|
# Logs are sent to journald and forwarded to Loki via promtail.
|
||||||
|
# Query with: {host="<hostname>"} |= "EXECVE"
|
||||||
|
{
|
||||||
|
# Enable Linux audit subsystem
|
||||||
|
security.audit.enable = true;
|
||||||
|
security.auditd.enable = true;
|
||||||
|
|
||||||
|
# Log execve syscalls only from interactive login sessions
|
||||||
|
# auid!=4294967295 means "audit login uid is set" (excludes system services, nix builds)
|
||||||
|
security.audit.rules = [
|
||||||
|
"-a exit,always -F arch=b64 -S execve -F auid!=4294967295"
|
||||||
|
];
|
||||||
|
|
||||||
|
# Forward audit logs to journald (so promtail ships them to Loki)
|
||||||
|
services.journald.audit = true;
|
||||||
|
}
|
||||||
217
docs/host-creation.md
Normal file
217
docs/host-creation.md
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
# Host Creation Pipeline
|
||||||
|
|
||||||
|
This document describes the process for creating new hosts in the homelab infrastructure.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
We use the `create-host` script to create new hosts, which generates default configurations from a template. We then use OpenTofu to deploy both secrets and VMs. The VMs boot using a template image (built from `hosts/template2`), which starts a bootstrap process. This bootstrap process applies the host's NixOS configuration and then reboots into the new config.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
All tools are available in the devshell: `create-host`, `bao` (OpenBao CLI), `tofu`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
Steps marked with **USER** must be performed by the user due to credential requirements.
|
||||||
|
|
||||||
|
1. **USER**: Run `create-host --hostname <name> --ip <ip/prefix>`
|
||||||
|
2. Edit the auto-generated configurations in `hosts/<hostname>/` to import whatever modules are needed for its purpose
|
||||||
|
3. Add any secrets needed to `terraform/vault/`
|
||||||
|
4. Edit the VM specs in `terraform/vms.tf` if needed. To deploy from a branch other than master, add `flake_branch = "<branch>"` to the VM definition
|
||||||
|
5. Push configuration to master (or the branch specified by `flake_branch`)
|
||||||
|
6. **USER**: Apply terraform:
|
||||||
|
```bash
|
||||||
|
nix develop -c tofu -chdir=terraform/vault apply
|
||||||
|
nix develop -c tofu -chdir=terraform apply
|
||||||
|
```
|
||||||
|
7. Once terraform completes, a VM boots in Proxmox using the template image
|
||||||
|
8. The VM runs the `nixos-bootstrap` service, which applies the host config and reboots
|
||||||
|
9. After reboot, the host should be operational
|
||||||
|
10. Trigger auto-upgrade on `ns1` and `ns2` to propagate DNS records for the new host
|
||||||
|
11. Trigger auto-upgrade on `monitoring01` to add the host to Prometheus scrape targets
|
||||||
|
|
||||||
|
## Tier Specification
|
||||||
|
|
||||||
|
New hosts should set `homelab.host.tier` in their configuration:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host.tier = "test"; # or "prod"
|
||||||
|
```
|
||||||
|
|
||||||
|
- **test** - Test-tier hosts can receive remote deployments via the `homelab-deploy` MCP server and have different credential access. Use for staging/testing.
|
||||||
|
- **prod** - Production hosts. Deployments require direct access or the CLI with appropriate credentials.
|
||||||
|
|
||||||
|
## Observability
|
||||||
|
|
||||||
|
During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with:
|
||||||
|
|
||||||
|
```
|
||||||
|
{job="bootstrap", hostname="<hostname>"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Bootstrap Stages
|
||||||
|
|
||||||
|
The bootstrap process reports these stages via the `stage` label:
|
||||||
|
|
||||||
|
| Stage | Message | Meaning |
|
||||||
|
|-------|---------|---------|
|
||||||
|
| `starting` | Bootstrap starting for \<host\> (branch: \<branch\>) | Bootstrap service has started |
|
||||||
|
| `network_ok` | Network connectivity confirmed | Can reach git server |
|
||||||
|
| `vault_ok` | Vault credentials unwrapped and stored | AppRole credentials provisioned |
|
||||||
|
| `vault_skip` | No Vault token provided - skipping credential setup | No wrapped token was provided |
|
||||||
|
| `vault_warn` | Failed to unwrap Vault token - continuing without secrets | Token unwrap failed (expired/used) |
|
||||||
|
| `building` | Starting nixos-rebuild boot | NixOS build starting |
|
||||||
|
| `success` | Build successful - rebooting into new configuration | Build complete, rebooting |
|
||||||
|
| `failed` | nixos-rebuild failed - manual intervention required | Build failed |
|
||||||
|
|
||||||
|
### Useful Queries
|
||||||
|
|
||||||
|
```
|
||||||
|
# All bootstrap activity for a host
|
||||||
|
{job="bootstrap", hostname="myhost"}
|
||||||
|
|
||||||
|
# Track all failures
|
||||||
|
{job="bootstrap", stage="failed"}
|
||||||
|
|
||||||
|
# Monitor builds in progress
|
||||||
|
{job="bootstrap", stage=~"building|success"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Once the VM reboots with its full configuration, it will start publishing metrics to Prometheus and logs to Loki via Promtail.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
1. Check bootstrap completed successfully:
|
||||||
|
```
|
||||||
|
{job="bootstrap", hostname="<hostname>", stage="success"}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Verify the host is up and reporting metrics:
|
||||||
|
```promql
|
||||||
|
up{instance=~"<hostname>.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Verify the correct flake revision is deployed:
|
||||||
|
```promql
|
||||||
|
nixos_flake_info{instance=~"<hostname>.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Check logs are flowing:
|
||||||
|
```
|
||||||
|
{hostname="<hostname>"}
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Confirm expected services are running and producing logs
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Bootstrap Failed
|
||||||
|
|
||||||
|
#### Common Issues
|
||||||
|
|
||||||
|
* VM has trouble running the initial nixos-rebuild. Usually caused when it needs to compile packages from scratch because they are not available in our local nix-cache.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources:
|
||||||
|
```
|
||||||
|
{job="bootstrap", hostname="<hostname>"}
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **USER**: SSH into the host and check the bootstrap service:
|
||||||
|
```bash
|
||||||
|
ssh root@<hostname>
|
||||||
|
journalctl -u nixos-bootstrap.service
|
||||||
|
```
|
||||||
|
|
||||||
|
3. If the build failed due to resource constraints, increase VM specs in `terraform/vms.tf` and redeploy, or manually run the rebuild:
|
||||||
|
```bash
|
||||||
|
nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git#<hostname>
|
||||||
|
```
|
||||||
|
|
||||||
|
4. If the host config doesn't exist in the flake, ensure step 5 was completed (config pushed to the correct branch).
|
||||||
|
|
||||||
|
### Vault Credentials Not Working
|
||||||
|
|
||||||
|
Usually caused by running the `create-host` script without proper credentials, or the wrapped token has expired/already been used.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Check if credentials exist on the host:
|
||||||
|
```bash
|
||||||
|
ssh root@<hostname>
|
||||||
|
ls -la /var/lib/vault/approle/
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check bootstrap logs for vault-related stages:
|
||||||
|
```
|
||||||
|
{job="bootstrap", hostname="<hostname>", stage=~"vault.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **USER**: Regenerate and provision credentials manually:
|
||||||
|
```bash
|
||||||
|
nix develop -c ansible-playbook ansible/playbooks/provision-approle.yml -l <hostname>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Host Not Appearing in DNS
|
||||||
|
|
||||||
|
Usually caused by not having deployed the commit with the new host to ns1/ns2.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Verify the host config has a static IP configured in `systemd.network.networks`
|
||||||
|
|
||||||
|
2. Check that `homelab.dns.enable` is not set to `false`
|
||||||
|
|
||||||
|
3. **USER**: Trigger auto-upgrade on DNS servers:
|
||||||
|
```bash
|
||||||
|
ssh root@ns1 systemctl start nixos-upgrade.service
|
||||||
|
ssh root@ns2 systemctl start nixos-upgrade.service
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Verify DNS resolution after upgrade completes:
|
||||||
|
```bash
|
||||||
|
dig @ns1.home.2rjus.net <hostname>.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
### Host Not Being Scraped by Prometheus
|
||||||
|
|
||||||
|
Usually caused by not having deployed the commit with the new host to the monitoring host.
|
||||||
|
|
||||||
|
#### Troubleshooting
|
||||||
|
|
||||||
|
1. Check that `homelab.monitoring.enable` is not set to `false`
|
||||||
|
|
||||||
|
2. **USER**: Trigger auto-upgrade on monitoring01:
|
||||||
|
```bash
|
||||||
|
ssh root@monitoring01 systemctl start nixos-upgrade.service
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Verify the target appears in Prometheus:
|
||||||
|
```promql
|
||||||
|
up{instance=~"<hostname>.*"}
|
||||||
|
```
|
||||||
|
|
||||||
|
4. If the target is down, check that node-exporter is running on the host:
|
||||||
|
```bash
|
||||||
|
ssh root@<hostname> systemctl status prometheus-node-exporter.service
|
||||||
|
```
|
||||||
|
|
||||||
|
## Related Files
|
||||||
|
|
||||||
|
| Path | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `scripts/create-host/` | The `create-host` script that generates host configurations |
|
||||||
|
| `hosts/template2/` | Template VM configuration (base image for new VMs) |
|
||||||
|
| `hosts/template2/bootstrap.nix` | Bootstrap service that applies NixOS config on first boot |
|
||||||
|
| `terraform/vms.tf` | VM definitions (specs, IPs, branch overrides) |
|
||||||
|
| `terraform/cloud-init.tf` | Cloud-init configuration (passes hostname, branch, vault token) |
|
||||||
|
| `terraform/vault/approle.tf` | AppRole policies for each host |
|
||||||
|
| `terraform/vault/secrets.tf` | Secret definitions in Vault |
|
||||||
|
| `terraform/vault/hosts-generated.tf` | Auto-generated wrapped tokens for VM bootstrap |
|
||||||
|
| `ansible/playbooks/provision-approle.yml` | Ansible playbook for manual credential provisioning |
|
||||||
|
| `flake.nix` | Flake with all host configurations (add new hosts here) |
|
||||||
282
docs/infrastructure.md
Normal file
282
docs/infrastructure.md
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
# Homelab Infrastructure
|
||||||
|
|
||||||
|
This document describes the physical and virtual infrastructure components that support the NixOS-managed servers in this repository.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The homelab consists of several core infrastructure components:
|
||||||
|
- **Proxmox VE** - Hypervisor hosting all NixOS VMs
|
||||||
|
- **TrueNAS** - Network storage and backup target
|
||||||
|
- **Ubiquiti EdgeRouter** - Primary router and gateway
|
||||||
|
- **Mikrotik Switch** - Core network switching
|
||||||
|
|
||||||
|
All NixOS configurations in this repository run as VMs on Proxmox and rely on these underlying infrastructure components.
|
||||||
|
|
||||||
|
## Network Topology
|
||||||
|
|
||||||
|
### Subnets
|
||||||
|
|
||||||
|
VLAN numbers are based on the third octet of the IP address.
|
||||||
|
|
||||||
|
TODO: VLAN naming is currently inconsistent across router/switch/Proxmox configurations. Need to standardize VLAN names and update all device configs to use consistent naming.
|
||||||
|
|
||||||
|
- `10.69.8.x` - Kubernetes (no longer in use)
|
||||||
|
- `10.69.12.x` - Core services
|
||||||
|
- `10.69.13.x` - NixOS VMs and core services
|
||||||
|
- `10.69.30.x` - Client network 1
|
||||||
|
- `10.69.31.x` - Clients network 2
|
||||||
|
- `10.69.99.x` - Management network
|
||||||
|
|
||||||
|
### Core Network Services
|
||||||
|
|
||||||
|
- **Gateway**: Web UI exposed on 10.69.10.1
|
||||||
|
- **DNS**: ns1 (10.69.13.5), ns2 (10.69.13.6)
|
||||||
|
- **Primary DNS Domain**: `home.2rjus.net`
|
||||||
|
|
||||||
|
## Hardware Components
|
||||||
|
|
||||||
|
### Proxmox Hypervisor
|
||||||
|
|
||||||
|
**Purpose**: Hosts all NixOS VMs defined in this repository
|
||||||
|
|
||||||
|
**Hardware**:
|
||||||
|
- CPU: AMD Ryzen 9 3900X 12-Core Processor
|
||||||
|
- RAM: 96GB (94Gi)
|
||||||
|
- Storage: 1TB NVMe SSD (nvme0n1)
|
||||||
|
|
||||||
|
**Management**:
|
||||||
|
- Web UI: `https://pve1.home.2rjus.net:8006`
|
||||||
|
- Cluster: Standalone
|
||||||
|
- Version: Proxmox VE 8.4.16 (kernel 6.8.12-18-pve)
|
||||||
|
|
||||||
|
**VM Provisioning**:
|
||||||
|
- Template VM: ID 9000 (built from `hosts/template2`)
|
||||||
|
- See `/terraform` directory for automated VM deployment using OpenTofu
|
||||||
|
|
||||||
|
**Storage**:
|
||||||
|
- ZFS pool: `rpool` on NVMe partition (nvme0n1p3)
|
||||||
|
- Total capacity: ~900GB (232GB used, 667GB available)
|
||||||
|
- Configuration: Single disk (no RAID)
|
||||||
|
- Scrub status: Last scrub completed successfully with 0 errors
|
||||||
|
|
||||||
|
**Networking**:
|
||||||
|
- Management interface: `vmbr0` - 10.69.12.75/24 (VLAN 12 - Core services)
|
||||||
|
- Physical interface: `enp9s0` (primary), `enp4s0` (unused)
|
||||||
|
- VM bridges:
|
||||||
|
- `vmbr0` - Main bridge (bridged to enp9s0)
|
||||||
|
- `vmbr0v8` - VLAN 8 (Kubernetes - deprecated)
|
||||||
|
- `vmbr0v13` - VLAN 13 (NixOS VMs and core services)
|
||||||
|
|
||||||
|
### TrueNAS
|
||||||
|
|
||||||
|
**Purpose**: Network storage, backup target, media storage
|
||||||
|
|
||||||
|
**Hardware**:
|
||||||
|
- Model: Custom build
|
||||||
|
- CPU: AMD Ryzen 5 5600G with Radeon Graphics
|
||||||
|
- RAM: 32GB (31.2 GiB)
|
||||||
|
- Disks:
|
||||||
|
- 2x Kingston SA400S37 240GB SSD (boot pool, mirrored)
|
||||||
|
- 2x Seagate ST16000NE000 16TB HDD (hdd-pool mirror-0)
|
||||||
|
- 2x WD WD80EFBX 8TB HDD (hdd-pool mirror-1)
|
||||||
|
- 2x Seagate ST8000VN004 8TB HDD (hdd-pool mirror-2)
|
||||||
|
- 1x NVMe 2TB (nvme-pool, no redundancy)
|
||||||
|
|
||||||
|
**Management**:
|
||||||
|
- Web UI: `https://nas.home.2rjus.net` (10.69.12.50)
|
||||||
|
- Hostname: `nas.home.2rjus.net`
|
||||||
|
- Version: TrueNAS-13.0-U6.1 (Core)
|
||||||
|
|
||||||
|
**Networking**:
|
||||||
|
- Primary interface: `mlxen0` - 10GbE (10Gbase-CX4) connected to sw1
|
||||||
|
- IP: 10.69.12.50/24 (VLAN 12 - Core services)
|
||||||
|
|
||||||
|
**ZFS Pools**:
|
||||||
|
- `boot-pool`: 206GB (mirrored SSDs) - 4% used
|
||||||
|
- Mirror of 2x Kingston 240GB SSDs
|
||||||
|
- Last scrub: No errors
|
||||||
|
- `hdd-pool`: 29.1TB total (three mirror vdevs, 28.4TB used, 658GB free) - 97% capacity
|
||||||
|
- mirror-0: 2x 16TB Seagate ST16000NE000
|
||||||
|
- mirror-1: 2x 8TB WD WD80EFBX
|
||||||
|
- mirror-2: 2x 8TB Seagate ST8000VN004
|
||||||
|
- Last scrub: No errors
|
||||||
|
- `nvme-pool`: 1.81TB (single NVMe, 70.4GB used, 1.74TB free) - 3% capacity
|
||||||
|
- Single NVMe drive, no redundancy
|
||||||
|
- Last scrub: No errors
|
||||||
|
|
||||||
|
**NFS Exports**:
|
||||||
|
- `/mnt/hdd-pool/media` - Media storage (exported to 10.69.0.0/16, used by Jellyfin)
|
||||||
|
- `/mnt/hdd-pool/virt/nfs-iso` - ISO storage for Proxmox
|
||||||
|
- `/mnt/hdd-pool/virt/kube-prod-pvc` - Kubernetes storage (deprecated)
|
||||||
|
|
||||||
|
**Jails**:
|
||||||
|
TrueNAS runs several FreeBSD jails for media management:
|
||||||
|
- nzbget - Usenet downloader
|
||||||
|
- restic-rest - Restic REST server for backups
|
||||||
|
- radarr - Movie management
|
||||||
|
- sonarr - TV show management
|
||||||
|
|
||||||
|
### Ubiquiti EdgeRouter
|
||||||
|
|
||||||
|
**Purpose**: Primary router, gateway, firewall, inter-VLAN routing
|
||||||
|
|
||||||
|
**Model**: EdgeRouter X 5-Port
|
||||||
|
|
||||||
|
**Hardware**:
|
||||||
|
- Serial: F09FC20E1A4C
|
||||||
|
|
||||||
|
**Management**:
|
||||||
|
- SSH: `ssh ubnt@10.69.10.1`
|
||||||
|
- Web UI: `https://10.69.10.1`
|
||||||
|
- Version: EdgeOS v2.0.9-hotfix.6 (build 5574651, 12/30/22)
|
||||||
|
|
||||||
|
**WAN Connection**:
|
||||||
|
- Interface: eth0
|
||||||
|
- Public IP: 84.213.73.123/20
|
||||||
|
- Gateway: 84.213.64.1
|
||||||
|
|
||||||
|
**Interface Layout**:
|
||||||
|
- **eth0**: WAN (public IP)
|
||||||
|
- **eth1**: 10.69.31.1/24 - Clients network 2
|
||||||
|
- **eth2**: Unused (down)
|
||||||
|
- **eth3**: 10.69.30.1/24 - Client network 1
|
||||||
|
- **eth4**: Trunk port to Mikrotik switch (carries all VLANs)
|
||||||
|
- eth4.8: 10.69.8.1/24 - K8S (deprecated)
|
||||||
|
- eth4.10: 10.69.10.1/24 - TRUSTED (management access)
|
||||||
|
- eth4.12: 10.69.12.1/24 - SERVER (Proxmox, TrueNAS, core services)
|
||||||
|
- eth4.13: 10.69.13.1/24 - SVC (NixOS VMs)
|
||||||
|
- eth4.21: 10.69.21.1/24 - CLIENTS
|
||||||
|
- eth4.22: 10.69.22.1/24 - WLAN (wireless clients)
|
||||||
|
- eth4.23: 10.69.23.1/24 - IOT
|
||||||
|
- eth4.99: 10.69.99.1/24 - MGMT (device management)
|
||||||
|
|
||||||
|
**Routing**:
|
||||||
|
- Default route: 0.0.0.0/0 via 84.213.64.1 (WAN gateway)
|
||||||
|
- Static route: 192.168.100.0/24 via eth0
|
||||||
|
- All internal VLANs directly connected
|
||||||
|
|
||||||
|
**DHCP Servers**:
|
||||||
|
Active DHCP pools on all networks:
|
||||||
|
- dhcp-8: VLAN 8 (K8S) - 91 addresses
|
||||||
|
- dhcp-12: VLAN 12 (SERVER) - 51 addresses
|
||||||
|
- dhcp-13: VLAN 13 (SVC) - 41 addresses
|
||||||
|
- dhcp-21: VLAN 21 (CLIENTS) - 141 addresses
|
||||||
|
- dhcp-22: VLAN 22 (WLAN) - 101 addresses
|
||||||
|
- dhcp-23: VLAN 23 (IOT) - 191 addresses
|
||||||
|
- dhcp-30: eth3 (Client network 1) - 101 addresses
|
||||||
|
- dhcp-31: eth1 (Clients network 2) - 21 addresses
|
||||||
|
- dhcp-mgmt: VLAN 99 (MGMT) - 51 addresses
|
||||||
|
|
||||||
|
**NAT/Firewall**:
|
||||||
|
- Masquerading on WAN interface (eth0)
|
||||||
|
|
||||||
|
### Mikrotik Switch
|
||||||
|
|
||||||
|
**Purpose**: Core Layer 2/3 switching
|
||||||
|
|
||||||
|
**Model**: MikroTik CRS326-24G-2S+ (24x 1GbE + 2x 10GbE SFP+)
|
||||||
|
|
||||||
|
**Hardware**:
|
||||||
|
- CPU: ARMv7 @ 800MHz
|
||||||
|
- RAM: 512MB
|
||||||
|
- Uptime: 21+ weeks
|
||||||
|
|
||||||
|
**Management**:
|
||||||
|
- Hostname: `sw1.home.2rjus.net`
|
||||||
|
- SSH access: `ssh admin@sw1.home.2rjus.net` (using gunter SSH key)
|
||||||
|
- Management IP: 10.69.99.2/24 (VLAN 99)
|
||||||
|
- Version: RouterOS 6.47.10 (long-term)
|
||||||
|
|
||||||
|
**VLANs**:
|
||||||
|
- VLAN 8: Kubernetes (deprecated)
|
||||||
|
- VLAN 12: SERVERS - Core services subnet
|
||||||
|
- VLAN 13: SVC - Services subnet
|
||||||
|
- VLAN 21: CLIENTS
|
||||||
|
- VLAN 22: WLAN - Wireless network
|
||||||
|
- VLAN 23: IOT
|
||||||
|
- VLAN 99: MGMT - Management network
|
||||||
|
|
||||||
|
**Port Layout** (active ports):
|
||||||
|
- **ether1**: Uplink to EdgeRouter (trunk, carries all VLANs)
|
||||||
|
- **ether11**: virt-mini1 (VLAN 12 - SERVERS)
|
||||||
|
- **ether12**: Home Assistant (VLAN 12 - SERVERS)
|
||||||
|
- **ether24**: Wireless AP (VLAN 22 - WLAN)
|
||||||
|
- **sfp-sfpplus1**: Media server/Jellyfin (VLAN 12) - 10Gbps, 7m copper DAC
|
||||||
|
- **sfp-sfpplus2**: TrueNAS (VLAN 12) - 10Gbps, 1m copper DAC
|
||||||
|
|
||||||
|
**Bridge Configuration**:
|
||||||
|
- All ports bridged to main bridge interface
|
||||||
|
- Hardware offloading enabled
|
||||||
|
- VLAN filtering enabled on bridge
|
||||||
|
|
||||||
|
## Backup & Disaster Recovery
|
||||||
|
|
||||||
|
### Backup Strategy
|
||||||
|
|
||||||
|
**NixOS VMs**:
|
||||||
|
- Declarative configurations in this git repository
|
||||||
|
- Secrets: SOPS-encrypted, backed up with repository
|
||||||
|
- State/data: Some hosts are backed up to nas host, but this should be improved and expanded to more hosts.
|
||||||
|
|
||||||
|
**Proxmox**:
|
||||||
|
- VM backups: Not currently implemented
|
||||||
|
|
||||||
|
**Critical Credentials**:
|
||||||
|
|
||||||
|
TODO: Document this
|
||||||
|
|
||||||
|
- OpenBao root token and unseal keys: _[offline secure storage location]_
|
||||||
|
- Proxmox root password: _[secure storage]_
|
||||||
|
- TrueNAS admin password: _[secure storage]_
|
||||||
|
- Router admin credentials: _[secure storage]_
|
||||||
|
|
||||||
|
### Disaster Recovery Procedures
|
||||||
|
|
||||||
|
**Total Infrastructure Loss**:
|
||||||
|
1. Restore Proxmox from installation media
|
||||||
|
2. Restore TrueNAS from installation media, import ZFS pools
|
||||||
|
3. Restore network configuration on EdgeRouter and Mikrotik
|
||||||
|
4. Rebuild NixOS VMs from this repository using Proxmox template
|
||||||
|
5. Restore stateful data from TrueNAS backups
|
||||||
|
6. Re-initialize OpenBao and restore from backup if needed
|
||||||
|
|
||||||
|
**Individual VM Loss**:
|
||||||
|
1. Deploy new VM from template using OpenTofu (`terraform/`)
|
||||||
|
2. Run `nixos-rebuild` with appropriate flake configuration
|
||||||
|
3. Restore any stateful data from backups
|
||||||
|
4. For vault01: follow re-provisioning steps in `docs/vault/auto-unseal.md`
|
||||||
|
|
||||||
|
**Network Device Failure**:
|
||||||
|
- EdgeRouter: _[config backup location, restoration procedure]_
|
||||||
|
- Mikrotik: _[config backup location, restoration procedure]_
|
||||||
|
|
||||||
|
## Future Additions
|
||||||
|
|
||||||
|
- Additional Proxmox nodes for clustering
|
||||||
|
- Backup Proxmox Backup Server
|
||||||
|
- Additional TrueNAS for replication
|
||||||
|
|
||||||
|
## Maintenance Notes
|
||||||
|
|
||||||
|
### Proxmox Updates
|
||||||
|
|
||||||
|
- Update schedule: manual
|
||||||
|
- Pre-update checklist: yolo
|
||||||
|
|
||||||
|
### TrueNAS Updates
|
||||||
|
|
||||||
|
- Update schedule: manual
|
||||||
|
|
||||||
|
### Network Device Updates
|
||||||
|
|
||||||
|
- EdgeRouter: manual
|
||||||
|
- Mikrotik: manual
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
**Infrastructure Monitoring**:
|
||||||
|
|
||||||
|
TODO: Improve monitoring for physical hosts (proxmox, nas)
|
||||||
|
TODO: Improve monitoring for networking equipment
|
||||||
|
|
||||||
|
All NixOS VMs ship metrics to monitoring01 via node-exporter and logs via Promtail. See `/services/monitoring/` for the observability stack configuration.
|
||||||
183
docs/plans/completed/auth-system-replacement.md
Normal file
183
docs/plans/completed/auth-system-replacement.md
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
# Authentication System Replacement Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Deploy a modern, unified authentication solution for the homelab. Provides central user management, SSO for web services, and consistent UID/GID mapping for NAS permissions.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **Central user database** - Manage users across all homelab hosts from a single source
|
||||||
|
2. **Linux PAM/NSS integration** - Users can SSH into hosts using central credentials
|
||||||
|
3. **UID/GID consistency** - Proper POSIX attributes for NAS share permissions
|
||||||
|
4. **OIDC provider** - Single sign-on for homelab web services (Grafana, etc.)
|
||||||
|
|
||||||
|
## Solution: Kanidm
|
||||||
|
|
||||||
|
Kanidm was chosen for the following reasons:
|
||||||
|
|
||||||
|
| Requirement | Kanidm Support |
|
||||||
|
|-------------|----------------|
|
||||||
|
| Central user database | Native |
|
||||||
|
| Linux PAM/NSS (host login) | Native NixOS module |
|
||||||
|
| UID/GID for NAS | POSIX attributes supported |
|
||||||
|
| OIDC for services | Built-in |
|
||||||
|
| Declarative config | Excellent NixOS provisioning |
|
||||||
|
| Simplicity | Modern API, LDAP optional |
|
||||||
|
| NixOS integration | First-class |
|
||||||
|
|
||||||
|
### Configuration Files
|
||||||
|
|
||||||
|
- **Host configuration:** `hosts/kanidm01/`
|
||||||
|
- **Service module:** `services/kanidm/default.nix`
|
||||||
|
|
||||||
|
## NAS Integration
|
||||||
|
|
||||||
|
### Current: TrueNAS CORE (FreeBSD)
|
||||||
|
|
||||||
|
TrueNAS CORE has a built-in LDAP client. Kanidm's read-only LDAP interface will work for NFS share permissions:
|
||||||
|
|
||||||
|
- **NFS shares**: Only need consistent UID/GID mapping - Kanidm's LDAP provides this
|
||||||
|
- **No SMB requirement**: SMB would need Samba schema attributes (deprecated in TrueNAS 13.0+), but we're NFS-only
|
||||||
|
|
||||||
|
Configuration approach:
|
||||||
|
1. Enable Kanidm's LDAP interface (`ldapbindaddress = "0.0.0.0:636"`)
|
||||||
|
2. Import internal CA certificate into TrueNAS
|
||||||
|
3. Configure TrueNAS LDAP client with Kanidm's Base DN and bind credentials
|
||||||
|
4. Users/groups appear in TrueNAS permission dropdowns
|
||||||
|
|
||||||
|
Note: Kanidm's LDAP is read-only and uses LDAPS only (no StartTLS). This is fine for our use case.
|
||||||
|
|
||||||
|
### Future: NixOS NAS
|
||||||
|
|
||||||
|
When the NAS is migrated to NixOS, it becomes a first-class citizen:
|
||||||
|
|
||||||
|
- Native Kanidm PAM/NSS integration (same as other hosts)
|
||||||
|
- No LDAP compatibility layer needed
|
||||||
|
- Full integration with the rest of the homelab
|
||||||
|
|
||||||
|
This future migration path is a strong argument for Kanidm over LDAP-only solutions.
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
1. **Create kanidm01 host and service module** ✅
|
||||||
|
- Host: `kanidm01.home.2rjus.net` (10.69.13.23, test tier)
|
||||||
|
- Service module: `services/kanidm/`
|
||||||
|
- TLS via internal ACME (`auth.home.2rjus.net`)
|
||||||
|
- Vault integration for idm_admin password
|
||||||
|
- LDAPS on port 636
|
||||||
|
|
||||||
|
2. **Configure provisioning** ✅
|
||||||
|
- Groups provisioned declaratively: `admins`, `users`, `ssh-users`
|
||||||
|
- Users managed imperatively via CLI (allows setting POSIX passwords in one step)
|
||||||
|
- POSIX attributes enabled (UID/GID range 65,536-69,999)
|
||||||
|
|
||||||
|
3. **Test NAS integration** (in progress)
|
||||||
|
- ✅ LDAP interface verified working
|
||||||
|
- Configure TrueNAS LDAP client to connect to Kanidm
|
||||||
|
- Verify UID/GID mapping works with NFS shares
|
||||||
|
|
||||||
|
4. **Add OIDC clients** for homelab services
|
||||||
|
- Grafana
|
||||||
|
- Other services as needed
|
||||||
|
|
||||||
|
5. **Create client module** in `system/` for PAM/NSS ✅
|
||||||
|
- Module: `system/kanidm-client.nix`
|
||||||
|
- `homelab.kanidm.enable = true` enables PAM/NSS
|
||||||
|
- Short usernames (not SPN format)
|
||||||
|
- Home directory symlinks via `home_alias`
|
||||||
|
- Enabled on test tier: testvm01, testvm02, testvm03
|
||||||
|
|
||||||
|
6. **Documentation** ✅
|
||||||
|
- `docs/user-management.md` - CLI workflows, troubleshooting
|
||||||
|
- User/group creation procedures verified working
|
||||||
|
|
||||||
|
## Progress
|
||||||
|
|
||||||
|
### Completed (2026-02-08)
|
||||||
|
|
||||||
|
**Kanidm server deployed on kanidm01 (test tier):**
|
||||||
|
- Host: `kanidm01.home.2rjus.net` (10.69.13.23)
|
||||||
|
- WebUI: `https://auth.home.2rjus.net`
|
||||||
|
- LDAPS: port 636
|
||||||
|
- Valid certificate from internal CA
|
||||||
|
|
||||||
|
**Configuration:**
|
||||||
|
- Kanidm 1.8 with secret provisioning support
|
||||||
|
- Daily backups at 22:00 (7 versions retained)
|
||||||
|
- Vault integration for idm_admin password
|
||||||
|
- Prometheus monitoring scrape target configured
|
||||||
|
|
||||||
|
**Provisioned entities:**
|
||||||
|
- Groups: `admins`, `users`, `ssh-users` (declarative)
|
||||||
|
- Users managed via CLI (imperative)
|
||||||
|
|
||||||
|
**Verified working:**
|
||||||
|
- WebUI login with idm_admin
|
||||||
|
- LDAP bind and search with POSIX-enabled user
|
||||||
|
- LDAPS with valid internal CA certificate
|
||||||
|
|
||||||
|
### Completed (2026-02-08) - PAM/NSS Client
|
||||||
|
|
||||||
|
**Client module deployed (`system/kanidm-client.nix`):**
|
||||||
|
- `homelab.kanidm.enable = true` enables PAM/NSS integration
|
||||||
|
- Connects to auth.home.2rjus.net
|
||||||
|
- Short usernames (`torjus` instead of `torjus@home.2rjus.net`)
|
||||||
|
- Home directory symlinks (`/home/torjus` → UUID-based dir)
|
||||||
|
- Login restricted to `ssh-users` group
|
||||||
|
|
||||||
|
**Enabled on test tier:**
|
||||||
|
- testvm01, testvm02, testvm03
|
||||||
|
|
||||||
|
**Verified working:**
|
||||||
|
- User/group resolution via `getent`
|
||||||
|
- SSH login with Kanidm unix passwords
|
||||||
|
- Home directory creation with symlinks
|
||||||
|
- Imperative user/group creation via CLI
|
||||||
|
|
||||||
|
**Documentation:**
|
||||||
|
- `docs/user-management.md` with full CLI workflows
|
||||||
|
- Password requirements (min 10 chars)
|
||||||
|
- Troubleshooting guide (nscd, cache invalidation)
|
||||||
|
|
||||||
|
### UID/GID Range (Resolved)
|
||||||
|
|
||||||
|
**Range: 65,536 - 69,999** (manually allocated)
|
||||||
|
|
||||||
|
- Users: 65,536 - 67,999 (up to ~2500 users)
|
||||||
|
- Groups: 68,000 - 69,999 (up to ~2000 groups)
|
||||||
|
|
||||||
|
Rationale:
|
||||||
|
- Starts at Kanidm's recommended minimum (65,536)
|
||||||
|
- Well above NixOS system users (typically <1000)
|
||||||
|
- Avoids Podman/container issues with very high GIDs
|
||||||
|
|
||||||
|
### Completed (2026-02-08) - OAuth2/OIDC for Grafana
|
||||||
|
|
||||||
|
**OAuth2 client deployed for Grafana on monitoring02:**
|
||||||
|
- Client ID: `grafana`
|
||||||
|
- Redirect URL: `https://grafana-test.home.2rjus.net/login/generic_oauth`
|
||||||
|
- Scope maps: `openid`, `profile`, `email`, `groups` for `users` group
|
||||||
|
- Role mapping: `admins` group → Grafana Admin, others → Viewer
|
||||||
|
|
||||||
|
**Configuration locations:**
|
||||||
|
- Kanidm OAuth2 client: `services/kanidm/default.nix`
|
||||||
|
- Grafana OIDC config: `services/grafana/default.nix`
|
||||||
|
- Vault secret: `services/grafana/oauth2-client-secret`
|
||||||
|
|
||||||
|
**Key findings:**
|
||||||
|
- PKCE is required by Kanidm - enable `use_pkce = true` in Grafana
|
||||||
|
- Must set `email_attribute_path`, `login_attribute_path`, `name_attribute_path` to extract from userinfo
|
||||||
|
- Users need: primary credential (password + TOTP for MFA), membership in `users` group, email address set
|
||||||
|
- Unix password is separate from primary credential (web login requires primary credential)
|
||||||
|
|
||||||
|
### Next Steps
|
||||||
|
|
||||||
|
1. Enable PAM/NSS on production hosts (after test tier validation)
|
||||||
|
2. Configure TrueNAS LDAP client for NAS integration testing
|
||||||
|
3. Add OAuth2 clients for other services as needed
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [Kanidm Documentation](https://kanidm.github.io/kanidm/stable/)
|
||||||
|
- [NixOS Kanidm Module](https://search.nixos.org/options?query=services.kanidm)
|
||||||
|
- [Kanidm PAM/NSS Integration](https://kanidm.github.io/kanidm/stable/pam_and_nsswitch.html)
|
||||||
669
docs/plans/completed/automated-host-deployment-pipeline.md
Normal file
669
docs/plans/completed/automated-host-deployment-pipeline.md
Normal file
@@ -0,0 +1,669 @@
|
|||||||
|
# Automated Host Deployment Pipeline
|
||||||
|
|
||||||
|
## Vision
|
||||||
|
|
||||||
|
Automate the entire process of creating, configuring, and deploying new NixOS hosts on Proxmox from a single command or script.
|
||||||
|
|
||||||
|
**Desired workflow:**
|
||||||
|
```bash
|
||||||
|
./scripts/create-host.sh --hostname myhost --ip 10.69.13.50
|
||||||
|
# Script creates config, deploys VM, bootstraps NixOS, and you're ready to go
|
||||||
|
```
|
||||||
|
|
||||||
|
**Current manual workflow (from CLAUDE.md):**
|
||||||
|
1. Create `/hosts/<hostname>/` directory structure
|
||||||
|
2. Add host to `flake.nix`
|
||||||
|
3. Add DNS entries
|
||||||
|
4. Clone template VM manually
|
||||||
|
5. Run `prepare-host.sh` on new VM
|
||||||
|
6. Add generated age key to `.sops.yaml`
|
||||||
|
7. Configure networking
|
||||||
|
8. Commit and push
|
||||||
|
9. Run `nixos-rebuild boot --flake URL#<hostname>` on host
|
||||||
|
|
||||||
|
## The Plan
|
||||||
|
|
||||||
|
### Phase 1: Parameterized OpenTofu Deployments ✅ COMPLETED
|
||||||
|
|
||||||
|
**Status:** Fully implemented and tested
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
- Locals-based structure using `for_each` pattern for multiple VM deployments
|
||||||
|
- All VM parameters configurable with smart defaults (CPU, memory, disk, IP, storage, etc.)
|
||||||
|
- Automatic DHCP vs static IP detection based on `ip` field presence
|
||||||
|
- Dynamic outputs showing deployed VM IPs and specifications
|
||||||
|
- Successfully tested deploying multiple VMs simultaneously
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [x] Create module/template structure in terraform for repeatable VM deployments
|
||||||
|
- [x] Parameterize VM configuration (hostname, CPU, memory, disk, IP)
|
||||||
|
- [x] Support both DHCP and static IP configuration via cloud-init
|
||||||
|
- [x] Test deploying multiple VMs from same template
|
||||||
|
|
||||||
|
**Deliverable:** ✅ Can deploy multiple VMs with custom parameters via OpenTofu in a single `tofu apply`
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `terraform/vms.tf` - VM definitions using locals map
|
||||||
|
- `terraform/outputs.tf` - Dynamic outputs for all VMs
|
||||||
|
- `terraform/variables.tf` - Configurable defaults
|
||||||
|
- `terraform/README.md` - Complete documentation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2: Host Configuration Generator ✅ COMPLETED
|
||||||
|
|
||||||
|
**Status:** ✅ Fully implemented and tested
|
||||||
|
**Completed:** 2025-02-01
|
||||||
|
**Enhanced:** 2025-02-01 (added --force flag)
|
||||||
|
|
||||||
|
**Goal:** Automate creation of host configuration files
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
- Python CLI tool packaged as Nix derivation
|
||||||
|
- Available as `create-host` command in devShell
|
||||||
|
- Rich terminal UI with configuration previews
|
||||||
|
- Comprehensive validation (hostname format/uniqueness, IP subnet/uniqueness)
|
||||||
|
- Jinja2 templates for NixOS configurations
|
||||||
|
- Automatic updates to flake.nix and terraform/vms.tf
|
||||||
|
- `--force` flag for regenerating existing configurations (useful for testing)
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [x] Create Python CLI with typer framework
|
||||||
|
- [x] Takes parameters: hostname, IP, CPU cores, memory, disk size
|
||||||
|
- [x] Generates `/hosts/<hostname>/` directory structure
|
||||||
|
- [x] Creates `configuration.nix` with proper hostname and networking
|
||||||
|
- [x] Generates `default.nix` with standard imports
|
||||||
|
- [x] References shared `hardware-configuration.nix` from template
|
||||||
|
- [x] Add host entry to `flake.nix` programmatically
|
||||||
|
- [x] Text-based manipulation (regex insertion)
|
||||||
|
- [x] Inserts new nixosConfiguration entry
|
||||||
|
- [x] Maintains proper formatting
|
||||||
|
- [x] Generate corresponding OpenTofu configuration
|
||||||
|
- [x] Adds VM definition to `terraform/vms.tf`
|
||||||
|
- [x] Uses parameters from CLI input
|
||||||
|
- [x] Supports both static IP and DHCP modes
|
||||||
|
- [x] Package as Nix derivation with templates
|
||||||
|
- [x] Add to flake packages and devShell
|
||||||
|
- [x] Implement dry-run mode
|
||||||
|
- [x] Write comprehensive README
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
# In nix develop shell
|
||||||
|
create-host \
|
||||||
|
--hostname test01 \
|
||||||
|
--ip 10.69.13.50/24 \ # optional, omit for DHCP
|
||||||
|
--cpu 4 \ # optional, default 2
|
||||||
|
--memory 4096 \ # optional, default 2048
|
||||||
|
--disk 50G \ # optional, default 20G
|
||||||
|
--dry-run # optional preview mode
|
||||||
|
```
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `scripts/create-host/` - Complete Python package with Nix derivation
|
||||||
|
- `scripts/create-host/README.md` - Full documentation and examples
|
||||||
|
|
||||||
|
**Deliverable:** ✅ Tool generates all config files for a new host, validated with Nix and Terraform
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 3: Bootstrap Mechanism ✅ COMPLETED
|
||||||
|
|
||||||
|
**Status:** ✅ Fully implemented and tested
|
||||||
|
**Completed:** 2025-02-01
|
||||||
|
**Enhanced:** 2025-02-01 (added branch support for testing)
|
||||||
|
|
||||||
|
**Goal:** Get freshly deployed VM to apply its specific host configuration
|
||||||
|
|
||||||
|
**Implementation:** Systemd oneshot service that runs on first boot after cloud-init
|
||||||
|
|
||||||
|
**Approach taken:** Systemd service (variant of Option A)
|
||||||
|
- Systemd service `nixos-bootstrap.service` runs on first boot
|
||||||
|
- Depends on `cloud-config.service` to ensure hostname is set
|
||||||
|
- Reads hostname from `hostnamectl` (set by cloud-init via Terraform)
|
||||||
|
- Supports custom git branch via `NIXOS_FLAKE_BRANCH` environment variable
|
||||||
|
- Runs `nixos-rebuild boot --flake git+https://git.t-juice.club/torjus/nixos-servers.git?ref=$BRANCH#${hostname}`
|
||||||
|
- Reboots into new configuration on success
|
||||||
|
- Fails gracefully without reboot on errors (network issues, missing config)
|
||||||
|
- Service self-destructs after successful bootstrap (not in new config)
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [x] Create bootstrap service module in template2
|
||||||
|
- [x] systemd oneshot service with proper dependencies
|
||||||
|
- [x] Reads hostname from hostnamectl (cloud-init sets it)
|
||||||
|
- [x] Checks network connectivity via HTTPS (curl)
|
||||||
|
- [x] Runs nixos-rebuild boot with flake URL
|
||||||
|
- [x] Reboots on success, fails gracefully on error
|
||||||
|
- [x] Configure cloud-init datasource
|
||||||
|
- [x] Use ConfigDrive datasource (Proxmox provider)
|
||||||
|
- [x] Add cloud-init disk to Terraform VMs (disks.ide.ide2.cloudinit)
|
||||||
|
- [x] Hostname passed via cloud-init user-data from Terraform
|
||||||
|
- [x] Test bootstrap service execution on fresh VM
|
||||||
|
- [x] Handle failure cases (flake doesn't exist, network issues)
|
||||||
|
- [x] Clear error messages in journald
|
||||||
|
- [x] No reboot on failure
|
||||||
|
- [x] System remains accessible for debugging
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `hosts/template2/bootstrap.nix` - Bootstrap service definition
|
||||||
|
- `hosts/template2/configuration.nix` - Cloud-init ConfigDrive datasource
|
||||||
|
- `terraform/vms.tf` - Cloud-init disk configuration
|
||||||
|
|
||||||
|
**Deliverable:** ✅ VMs automatically bootstrap and reboot into host-specific configuration on first boot
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 4: Secrets Management with OpenBao (Vault)
|
||||||
|
|
||||||
|
**Status:** 🚧 Phases 4a, 4b, 4c (partial), & 4d Complete
|
||||||
|
|
||||||
|
**Challenge:** Current sops-nix approach has chicken-and-egg problem with age keys
|
||||||
|
|
||||||
|
**Current workflow:**
|
||||||
|
1. VM boots, generates age key at `/var/lib/sops-nix/key.txt`
|
||||||
|
2. User runs `prepare-host.sh` which prints public key
|
||||||
|
3. User manually adds public key to `.sops.yaml`
|
||||||
|
4. User commits, pushes
|
||||||
|
5. VM can now decrypt secrets
|
||||||
|
|
||||||
|
**Selected approach:** Migrate to OpenBao (Vault fork) for centralized secrets management
|
||||||
|
|
||||||
|
**Why OpenBao instead of HashiCorp Vault:**
|
||||||
|
- HashiCorp Vault switched to BSL (Business Source License), unavailable in NixOS cache
|
||||||
|
- OpenBao is the community fork maintaining the pre-BSL MPL 2.0 license
|
||||||
|
- API-compatible with Vault, uses same Terraform provider
|
||||||
|
- Maintains all Vault features we need
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- Industry-standard secrets management (Vault-compatible experience)
|
||||||
|
- Eliminates manual age key distribution step
|
||||||
|
- Secrets-as-code via OpenTofu (infrastructure-as-code aligned)
|
||||||
|
- Centralized PKI management with ACME support (ready to replace step-ca)
|
||||||
|
- Automatic secret rotation capabilities
|
||||||
|
- Audit logging for all secret access (not yet enabled)
|
||||||
|
- AppRole authentication enables automated bootstrap
|
||||||
|
|
||||||
|
**Current Architecture:**
|
||||||
|
```
|
||||||
|
vault01.home.2rjus.net (10.69.13.19)
|
||||||
|
├─ KV Secrets Engine (ready to replace sops-nix)
|
||||||
|
│ ├─ secret/hosts/{hostname}/*
|
||||||
|
│ ├─ secret/services/{service}/*
|
||||||
|
│ └─ secret/shared/{category}/*
|
||||||
|
├─ PKI Engine (ready to replace step-ca for TLS)
|
||||||
|
│ ├─ Root CA (EC P-384, 10 year)
|
||||||
|
│ ├─ Intermediate CA (EC P-384, 5 year)
|
||||||
|
│ └─ ACME endpoint enabled
|
||||||
|
├─ SSH CA Engine (TODO: Phase 4c)
|
||||||
|
└─ AppRole Auth (per-host authentication configured)
|
||||||
|
↓
|
||||||
|
[✅ Phase 4d] New hosts authenticate on first boot
|
||||||
|
[✅ Phase 4d] Fetch secrets via Vault API
|
||||||
|
No manual key distribution needed
|
||||||
|
```
|
||||||
|
|
||||||
|
**Completed:**
|
||||||
|
- ✅ Phase 4a: OpenBao server with TPM2 auto-unseal
|
||||||
|
- ✅ Phase 4b: Infrastructure-as-code (secrets, policies, AppRoles, PKI)
|
||||||
|
- ✅ Phase 4d: Bootstrap integration for automated secrets access
|
||||||
|
|
||||||
|
**Next Steps:**
|
||||||
|
- Phase 4c: Migrate from step-ca to OpenBao PKI
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Phase 4a: Vault Server Setup ✅ COMPLETED
|
||||||
|
|
||||||
|
**Status:** ✅ Fully implemented and tested
|
||||||
|
**Completed:** 2026-02-02
|
||||||
|
|
||||||
|
**Goal:** Deploy and configure Vault server with auto-unseal
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
- Used **OpenBao** (Vault fork) instead of HashiCorp Vault due to BSL licensing concerns
|
||||||
|
- TPM2-based auto-unseal using systemd's native `LoadCredentialEncrypted`
|
||||||
|
- Self-signed bootstrap TLS certificates (avoiding circular dependency with step-ca)
|
||||||
|
- File-based storage backend at `/var/lib/openbao`
|
||||||
|
- Unix socket + TCP listener (0.0.0.0:8200) configuration
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [x] Create `hosts/vault01/` configuration
|
||||||
|
- [x] Basic NixOS configuration (hostname: vault01, IP: 10.69.13.19/24)
|
||||||
|
- [x] Created reusable `services/vault` module
|
||||||
|
- [x] Firewall not needed (trusted network)
|
||||||
|
- [x] Already in flake.nix, deployed via terraform
|
||||||
|
- [x] Implement auto-unseal mechanism
|
||||||
|
- [x] **TPM2-based auto-unseal** (preferred option)
|
||||||
|
- [x] systemd `LoadCredentialEncrypted` with TPM2 binding
|
||||||
|
- [x] `writeShellApplication` script with proper runtime dependencies
|
||||||
|
- [x] Reads multiple unseal keys (one per line) until unsealed
|
||||||
|
- [x] Auto-unseals on service start via `ExecStartPost`
|
||||||
|
- [x] Initial Vault setup
|
||||||
|
- [x] Initialized OpenBao with Shamir secret sharing (5 keys, threshold 3)
|
||||||
|
- [x] File storage backend
|
||||||
|
- [x] Self-signed TLS certificates via LoadCredential
|
||||||
|
- [x] Deploy to infrastructure
|
||||||
|
- [x] DNS entry added for vault01.home.2rjus.net
|
||||||
|
- [x] VM deployed via terraform
|
||||||
|
- [x] Verified OpenBao running and auto-unsealing
|
||||||
|
|
||||||
|
**Changes from Original Plan:**
|
||||||
|
- Used OpenBao instead of HashiCorp Vault (licensing)
|
||||||
|
- Used systemd's native TPM2 support instead of tpm2-tools directly
|
||||||
|
- Skipped audit logging (can be enabled later)
|
||||||
|
- Used self-signed certs initially (will migrate to OpenBao PKI later)
|
||||||
|
|
||||||
|
**Deliverable:** ✅ Running OpenBao server that auto-unseals on boot using TPM2
|
||||||
|
|
||||||
|
**Documentation:**
|
||||||
|
- `/services/vault/README.md` - Service module overview
|
||||||
|
- `/docs/vault/auto-unseal.md` - Complete TPM2 auto-unseal setup guide
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Phase 4b: Vault-as-Code with OpenTofu ✅ COMPLETED
|
||||||
|
|
||||||
|
**Status:** ✅ Fully implemented and tested
|
||||||
|
**Completed:** 2026-02-02
|
||||||
|
|
||||||
|
**Goal:** Manage all Vault configuration (secrets structure, policies, roles) as code
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
- Complete Terraform/OpenTofu configuration in `terraform/vault/`
|
||||||
|
- Locals-based pattern (similar to `vms.tf`) for declaring secrets and policies
|
||||||
|
- Auto-generation of secrets using `random_password` provider
|
||||||
|
- Three-tier secrets path hierarchy: `hosts/`, `services/`, `shared/`
|
||||||
|
- PKI infrastructure with **Elliptic Curve certificates** (P-384 for CAs, P-256 for leaf certs)
|
||||||
|
- ACME support enabled on intermediate CA
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [x] Set up Vault Terraform provider
|
||||||
|
- [x] Created `terraform/vault/` directory
|
||||||
|
- [x] Configured Vault provider (uses HashiCorp provider, compatible with OpenBao)
|
||||||
|
- [x] Credentials in terraform.tfvars (gitignored)
|
||||||
|
- [x] terraform.tfvars.example for reference
|
||||||
|
- [x] Enable and configure secrets engines
|
||||||
|
- [x] KV v2 engine at `secret/`
|
||||||
|
- [x] Three-tier path structure:
|
||||||
|
- `secret/hosts/{hostname}/*` - Host-specific secrets
|
||||||
|
- `secret/services/{service}/*` - Service-wide secrets
|
||||||
|
- `secret/shared/{category}/*` - Shared secrets (SMTP, backups, etc.)
|
||||||
|
- [x] Define policies as code
|
||||||
|
- [x] Policies auto-generated from `locals.host_policies`
|
||||||
|
- [x] Per-host policies with read/list on designated paths
|
||||||
|
- [x] Principle of least privilege enforced
|
||||||
|
- [x] Set up AppRole authentication
|
||||||
|
- [x] AppRole backend enabled at `approle/`
|
||||||
|
- [x] Roles auto-generated per host from `locals.host_policies`
|
||||||
|
- [x] Token TTL: 1 hour, max 24 hours
|
||||||
|
- [x] Policies bound to roles
|
||||||
|
- [x] Implement secrets-as-code patterns
|
||||||
|
- [x] Auto-generated secrets using `random_password` provider
|
||||||
|
- [x] Manual secrets supported via variables in terraform.tfvars
|
||||||
|
- [x] Secret structure versioned in .tf files
|
||||||
|
- [x] Secret values excluded from git
|
||||||
|
- [x] Set up PKI infrastructure
|
||||||
|
- [x] Root CA (10 year TTL, EC P-384)
|
||||||
|
- [x] Intermediate CA (5 year TTL, EC P-384)
|
||||||
|
- [x] PKI role for `*.home.2rjus.net` (30 day max TTL, EC P-256)
|
||||||
|
- [x] ACME enabled on intermediate CA
|
||||||
|
- [x] Support for static certificate issuance via Terraform
|
||||||
|
- [x] CRL, OCSP, and issuing certificate URLs configured
|
||||||
|
|
||||||
|
**Changes from Original Plan:**
|
||||||
|
- Used Elliptic Curve instead of RSA for all certificates (better performance, smaller keys)
|
||||||
|
- Implemented PKI infrastructure in Phase 4b instead of Phase 4c (more logical grouping)
|
||||||
|
- ACME support configured immediately (ready for migration from step-ca)
|
||||||
|
- Did not migrate existing sops-nix secrets yet (deferred to gradual migration)
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `terraform/vault/main.tf` - Provider configuration
|
||||||
|
- `terraform/vault/variables.tf` - Variable definitions
|
||||||
|
- `terraform/vault/approle.tf` - AppRole authentication (locals-based pattern)
|
||||||
|
- `terraform/vault/pki.tf` - PKI infrastructure with EC certificates
|
||||||
|
- `terraform/vault/secrets.tf` - KV secrets engine (auto-generation support)
|
||||||
|
- `terraform/vault/README.md` - Complete documentation and usage examples
|
||||||
|
- `terraform/vault/terraform.tfvars.example` - Example credentials
|
||||||
|
|
||||||
|
**Deliverable:** ✅ All secrets, policies, AppRoles, and PKI managed as OpenTofu code in `terraform/vault/`
|
||||||
|
|
||||||
|
**Documentation:**
|
||||||
|
- `/terraform/vault/README.md` - Comprehensive guide covering:
|
||||||
|
- Setup and deployment
|
||||||
|
- AppRole usage and host access patterns
|
||||||
|
- PKI certificate issuance (ACME, static, manual)
|
||||||
|
- Secrets management patterns
|
||||||
|
- ACME configuration and troubleshooting
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Phase 4c: PKI Migration (Replace step-ca)
|
||||||
|
|
||||||
|
**Status:** 🚧 Partially Complete - vault01 and test host migrated, remaining hosts pending
|
||||||
|
|
||||||
|
**Goal:** Migrate hosts from step-ca to OpenBao PKI for TLS certificates
|
||||||
|
|
||||||
|
**Note:** PKI infrastructure already set up in Phase 4b (root CA, intermediate CA, ACME support)
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [x] Set up OpenBao PKI engines (completed in Phase 4b)
|
||||||
|
- [x] Root CA (`pki/` mount, 10 year TTL, EC P-384)
|
||||||
|
- [x] Intermediate CA (`pki_int/` mount, 5 year TTL, EC P-384)
|
||||||
|
- [x] Signed intermediate with root CA
|
||||||
|
- [x] Configured CRL, OCSP, and issuing certificate URLs
|
||||||
|
- [x] Enable ACME support (completed in Phase 4b, fixed in Phase 4c)
|
||||||
|
- [x] Enabled ACME on intermediate CA
|
||||||
|
- [x] Created PKI role for `*.home.2rjus.net`
|
||||||
|
- [x] Set certificate TTLs (30 day max) and allowed domains
|
||||||
|
- [x] ACME directory: `https://vault01.home.2rjus.net:8200/v1/pki_int/acme/directory`
|
||||||
|
- [x] Fixed ACME response headers (added Replay-Nonce, Link, Location to allowed_response_headers)
|
||||||
|
- [x] Configured cluster path for ACME
|
||||||
|
- [x] Download and distribute root CA certificate
|
||||||
|
- [x] Added root CA to `system/pki/root-ca.nix`
|
||||||
|
- [x] Distributed to all hosts via system imports
|
||||||
|
- [x] Test certificate issuance
|
||||||
|
- [x] Tested ACME issuance on vaulttest01 successfully
|
||||||
|
- [x] Verified certificate chain and trust
|
||||||
|
- [x] Migrate vault01's own certificate
|
||||||
|
- [x] Created `bootstrap-vault-cert` script for initial certificate issuance via bao CLI
|
||||||
|
- [x] Issued certificate with SANs (vault01.home.2rjus.net + vault.home.2rjus.net)
|
||||||
|
- [x] Updated service to read certificates from `/var/lib/acme/vault01.home.2rjus.net/`
|
||||||
|
- [x] Configured ACME for automatic renewals
|
||||||
|
- [ ] Migrate hosts from step-ca to OpenBao
|
||||||
|
- [x] Tested on vaulttest01 (non-production host)
|
||||||
|
- [ ] Standardize hostname usage across all configurations
|
||||||
|
- [ ] Use `vault.home.2rjus.net` (CNAME) consistently everywhere
|
||||||
|
- [ ] Update NixOS configurations to use CNAME instead of vault01
|
||||||
|
- [ ] Update Terraform configurations to use CNAME
|
||||||
|
- [ ] Audit and fix mixed usage of vault01.home.2rjus.net vs vault.home.2rjus.net
|
||||||
|
- [ ] Update `system/acme.nix` to use OpenBao ACME endpoint
|
||||||
|
- [ ] Change server to `https://vault.home.2rjus.net:8200/v1/pki_int/acme/directory`
|
||||||
|
- [ ] Roll out to all hosts via auto-upgrade
|
||||||
|
- [ ] Configure SSH CA in OpenBao (optional, future work)
|
||||||
|
- [ ] Enable SSH secrets engine (`ssh/` mount)
|
||||||
|
- [ ] Generate SSH signing keys
|
||||||
|
- [ ] Create roles for host and user certificates
|
||||||
|
- [ ] Configure TTLs and allowed principals
|
||||||
|
- [ ] Distribute SSH CA public key to all hosts
|
||||||
|
- [ ] Update sshd_config to trust OpenBao CA
|
||||||
|
- [ ] Decommission step-ca
|
||||||
|
- [ ] Verify all ACME services migrated and working
|
||||||
|
- [ ] Stop step-ca service on ca host
|
||||||
|
- [ ] Archive step-ca configuration for backup
|
||||||
|
- [ ] Update documentation
|
||||||
|
|
||||||
|
**Implementation Details (2026-02-03):**
|
||||||
|
|
||||||
|
**ACME Configuration Fix:**
|
||||||
|
The key blocker was that OpenBao's PKI mount was filtering out required ACME response headers. The solution was to add `allowed_response_headers` to the Terraform mount configuration:
|
||||||
|
```hcl
|
||||||
|
allowed_response_headers = [
|
||||||
|
"Replay-Nonce", # Required for ACME nonce generation
|
||||||
|
"Link", # Required for ACME navigation
|
||||||
|
"Location" # Required for ACME resource location
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cluster Path Configuration:**
|
||||||
|
ACME requires the cluster path to include the full API path:
|
||||||
|
```hcl
|
||||||
|
path = "${var.vault_address}/v1/${vault_mount.pki_int.path}"
|
||||||
|
aia_path = "${var.vault_address}/v1/${vault_mount.pki_int.path}"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Bootstrap Process:**
|
||||||
|
Since vault01 needed a certificate from its own PKI (chicken-and-egg problem), we created a `bootstrap-vault-cert` script that:
|
||||||
|
1. Uses the Unix socket (no TLS) to issue a certificate via `bao` CLI
|
||||||
|
2. Places it in the ACME directory structure
|
||||||
|
3. Includes both vault01.home.2rjus.net and vault.home.2rjus.net as SANs
|
||||||
|
4. After restart, ACME manages renewals automatically
|
||||||
|
|
||||||
|
**Files Modified:**
|
||||||
|
- `terraform/vault/pki.tf` - Added allowed_response_headers, cluster config, ACME config
|
||||||
|
- `services/vault/default.nix` - Updated cert paths, added bootstrap script, configured ACME
|
||||||
|
- `system/pki/root-ca.nix` - Added OpenBao root CA to trust store
|
||||||
|
- `hosts/vaulttest01/configuration.nix` - Overrode ACME server for testing
|
||||||
|
|
||||||
|
**Deliverable:** ✅ vault01 and vaulttest01 using OpenBao PKI, remaining hosts still on step-ca
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Phase 4d: Bootstrap Integration ✅ COMPLETED (2026-02-02)
|
||||||
|
|
||||||
|
**Goal:** New hosts automatically authenticate to Vault on first boot, no manual steps
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [x] Update create-host tool
|
||||||
|
- [x] Generate wrapped token (24h TTL, single-use) for new host
|
||||||
|
- [x] Add host-specific policy to Vault (via terraform/vault/hosts-generated.tf)
|
||||||
|
- [x] Store wrapped token in terraform/vms.tf for cloud-init injection
|
||||||
|
- [x] Add `--regenerate-token` flag to regenerate only the token without overwriting config
|
||||||
|
- [x] Update template2 for Vault authentication
|
||||||
|
- [x] Reads wrapped token from cloud-init (/run/cloud-init-env)
|
||||||
|
- [x] Unwraps token to get role_id + secret_id
|
||||||
|
- [x] Stores AppRole credentials in /var/lib/vault/approle/ (persistent)
|
||||||
|
- [x] Graceful fallback if Vault unavailable during bootstrap
|
||||||
|
- [x] Create NixOS Vault secrets module (system/vault-secrets.nix)
|
||||||
|
- [x] Runtime secret fetching (services fetch on start, not at nixos-rebuild time)
|
||||||
|
- [x] Secrets cached in /var/lib/vault/cache/ for fallback when Vault unreachable
|
||||||
|
- [x] Secrets written to /run/secrets/ (tmpfs, cleared on reboot)
|
||||||
|
- [x] Fresh authentication per service start (no token renewal needed)
|
||||||
|
- [x] Optional periodic rotation with systemd timers
|
||||||
|
- [x] Critical service protection (no auto-restart for DNS, CA, Vault itself)
|
||||||
|
- [x] Create vault-fetch helper script
|
||||||
|
- [x] Standalone tool for fetching secrets from Vault
|
||||||
|
- [x] Authenticates using AppRole credentials
|
||||||
|
- [x] Writes individual files per secret key
|
||||||
|
- [x] Handles caching and fallback logic
|
||||||
|
- [x] Update bootstrap service (hosts/template2/bootstrap.nix)
|
||||||
|
- [x] Unwraps Vault token on first boot
|
||||||
|
- [x] Stores persistent AppRole credentials
|
||||||
|
- [x] Continues with nixos-rebuild
|
||||||
|
- [x] Services fetch secrets when they start
|
||||||
|
- [x] Update terraform cloud-init (terraform/cloud-init.tf)
|
||||||
|
- [x] Inject VAULT_ADDR and VAULT_WRAPPED_TOKEN via write_files
|
||||||
|
- [x] Write to /run/cloud-init-env (tmpfs, cleaned on reboot)
|
||||||
|
- [x] Fixed YAML indentation issues (write_files at top level)
|
||||||
|
- [x] Support flake_branch alongside vault credentials
|
||||||
|
- [x] Test complete flow
|
||||||
|
- [x] Created vaulttest01 test host
|
||||||
|
- [x] Verified bootstrap with Vault integration
|
||||||
|
- [x] Verified service secret fetching
|
||||||
|
- [x] Tested cache fallback when Vault unreachable
|
||||||
|
- [x] Tested wrapped token single-use (second bootstrap fails as expected)
|
||||||
|
- [x] Confirmed zero manual steps required
|
||||||
|
|
||||||
|
**Implementation Details:**
|
||||||
|
|
||||||
|
**Wrapped Token Security:**
|
||||||
|
- Single-use tokens prevent reuse if leaked
|
||||||
|
- 24h TTL limits exposure window
|
||||||
|
- Safe to commit to git (expired/used tokens useless)
|
||||||
|
- Regenerate with `create-host --hostname X --regenerate-token`
|
||||||
|
|
||||||
|
**Secret Fetching:**
|
||||||
|
- Runtime (not build-time) keeps secrets out of Nix store
|
||||||
|
- Cache fallback enables service availability when Vault down
|
||||||
|
- Fresh authentication per service start (no renewal complexity)
|
||||||
|
- Individual files per secret key for easy consumption
|
||||||
|
|
||||||
|
**Bootstrap Flow:**
|
||||||
|
```
|
||||||
|
1. create-host --hostname myhost --ip 10.69.13.x/24
|
||||||
|
↓ Generates wrapped token, updates terraform
|
||||||
|
2. tofu apply (deploys VM with cloud-init)
|
||||||
|
↓ Cloud-init writes wrapped token to /run/cloud-init-env
|
||||||
|
3. nixos-bootstrap.service runs:
|
||||||
|
↓ Unwraps token → gets role_id + secret_id
|
||||||
|
↓ Stores in /var/lib/vault/approle/ (persistent)
|
||||||
|
↓ Runs nixos-rebuild boot
|
||||||
|
4. Service starts → fetches secrets from Vault
|
||||||
|
↓ Uses stored AppRole credentials
|
||||||
|
↓ Caches secrets for fallback
|
||||||
|
5. Done - zero manual intervention
|
||||||
|
```
|
||||||
|
|
||||||
|
**Files Created:**
|
||||||
|
- `scripts/vault-fetch/` - Secret fetching helper (Nix package)
|
||||||
|
- `system/vault-secrets.nix` - NixOS module for declarative Vault secrets
|
||||||
|
- `scripts/create-host/vault_helper.py` - Vault API integration
|
||||||
|
- `terraform/vault/hosts-generated.tf` - Auto-generated host policies
|
||||||
|
- `docs/vault-bootstrap-implementation.md` - Architecture documentation
|
||||||
|
- `docs/vault-bootstrap-testing.md` - Testing guide
|
||||||
|
|
||||||
|
**Configuration:**
|
||||||
|
- Vault address: `https://vault01.home.2rjus.net:8200` (configurable)
|
||||||
|
- All defaults remain configurable via environment variables or NixOS options
|
||||||
|
|
||||||
|
**Next Steps:**
|
||||||
|
- Gradually migrate existing services from sops-nix to Vault
|
||||||
|
- Add CNAME for vault.home.2rjus.net → vault01.home.2rjus.net
|
||||||
|
- Phase 4c: Migrate from step-ca to OpenBao PKI (future)
|
||||||
|
|
||||||
|
**Deliverable:** ✅ Fully automated secrets access from first boot, zero manual steps
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 6: Integration Script
|
||||||
|
|
||||||
|
**Goal:** Single command to create and deploy a new host
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
- [ ] Create `scripts/create-host.sh` master script that orchestrates:
|
||||||
|
1. Prompts for: hostname, IP (or DHCP), CPU, memory, disk
|
||||||
|
2. Validates inputs (IP not in use, hostname unique, etc.)
|
||||||
|
3. Calls host config generator (Phase 2)
|
||||||
|
4. Generates OpenTofu config (Phase 2)
|
||||||
|
5. Handles secrets (Phase 4)
|
||||||
|
6. Updates DNS (Phase 5)
|
||||||
|
7. Commits all changes to git
|
||||||
|
8. Runs `tofu apply` to deploy VM
|
||||||
|
9. Waits for bootstrap to complete (Phase 3)
|
||||||
|
10. Prints success message with IP and SSH command
|
||||||
|
- [ ] Add `--dry-run` flag to preview changes
|
||||||
|
- [ ] Add `--interactive` mode vs `--batch` mode
|
||||||
|
- [ ] Error handling and rollback on failures
|
||||||
|
|
||||||
|
**Deliverable:** `./scripts/create-host.sh --hostname myhost --ip 10.69.13.50` creates a fully working host
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 7: Testing & Documentation
|
||||||
|
|
||||||
|
**Status:** 🚧 In Progress (testing improvements completed)
|
||||||
|
|
||||||
|
**Testing Improvements Implemented (2025-02-01):**
|
||||||
|
|
||||||
|
The pipeline now supports efficient testing without polluting the master branch:
|
||||||
|
|
||||||
|
**1. --force Flag for create-host**
|
||||||
|
- Re-run `create-host` to regenerate existing configurations
|
||||||
|
- Updates existing entries in flake.nix and terraform/vms.tf (no duplicates)
|
||||||
|
- Skip uniqueness validation checks
|
||||||
|
- Useful for iterating on configuration templates during testing
|
||||||
|
|
||||||
|
**2. Branch Support for Bootstrap**
|
||||||
|
- Bootstrap service reads `NIXOS_FLAKE_BRANCH` environment variable
|
||||||
|
- Defaults to `master` if not set
|
||||||
|
- Allows testing pipeline changes on feature branches
|
||||||
|
- Cloud-init passes branch via `/etc/environment`
|
||||||
|
|
||||||
|
**3. Cloud-init Disk for Branch Configuration**
|
||||||
|
- Terraform generates custom cloud-init snippets for test VMs
|
||||||
|
- Set `flake_branch` field in VM definition to use non-master branch
|
||||||
|
- Production VMs omit this field and use master (default)
|
||||||
|
- Files automatically uploaded to Proxmox via SSH
|
||||||
|
|
||||||
|
**Testing Workflow:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Create test branch
|
||||||
|
git checkout -b test-pipeline
|
||||||
|
|
||||||
|
# 2. Generate or update host config
|
||||||
|
create-host --hostname testvm01 --ip 10.69.13.100/24
|
||||||
|
|
||||||
|
# 3. Edit terraform/vms.tf to add test VM with branch
|
||||||
|
# vms = {
|
||||||
|
# "testvm01" = {
|
||||||
|
# ip = "10.69.13.100/24"
|
||||||
|
# flake_branch = "test-pipeline" # Bootstrap from this branch
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
|
||||||
|
# 4. Commit and push test branch
|
||||||
|
git add -A && git commit -m "test: add testvm01"
|
||||||
|
git push origin test-pipeline
|
||||||
|
|
||||||
|
# 5. Deploy VM
|
||||||
|
cd terraform && tofu apply
|
||||||
|
|
||||||
|
# 6. Watch bootstrap (VM fetches from test-pipeline branch)
|
||||||
|
ssh root@10.69.13.100
|
||||||
|
journalctl -fu nixos-bootstrap.service
|
||||||
|
|
||||||
|
# 7. Iterate: modify templates and regenerate with --force
|
||||||
|
cd .. && create-host --hostname testvm01 --ip 10.69.13.100/24 --force
|
||||||
|
git commit -am "test: update config" && git push
|
||||||
|
|
||||||
|
# Redeploy to test fresh bootstrap
|
||||||
|
cd terraform
|
||||||
|
tofu destroy -target=proxmox_vm_qemu.vm[\"testvm01\"] && tofu apply
|
||||||
|
|
||||||
|
# 8. Clean up when done: squash commits, merge to master, remove test VM
|
||||||
|
```
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `scripts/create-host/create_host.py` - Added --force parameter
|
||||||
|
- `scripts/create-host/manipulators.py` - Update vs insert logic
|
||||||
|
- `hosts/template2/bootstrap.nix` - Branch support via environment variable
|
||||||
|
- `terraform/vms.tf` - flake_branch field support
|
||||||
|
- `terraform/cloud-init.tf` - Custom cloud-init disk generation
|
||||||
|
- `terraform/variables.tf` - proxmox_host variable for SSH uploads
|
||||||
|
|
||||||
|
**Remaining Tasks:**
|
||||||
|
- [ ] Test full pipeline end-to-end on feature branch
|
||||||
|
- [ ] Update CLAUDE.md with testing workflow
|
||||||
|
- [ ] Add troubleshooting section
|
||||||
|
- [ ] Create examples for common scenarios (DHCP host, static IP host, etc.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
1. **Bootstrap method:** Cloud-init runcmd vs Terraform provisioner vs Ansible?
|
||||||
|
2. **Secrets handling:** Pre-generate keys vs post-deployment injection?
|
||||||
|
3. **DNS automation:** Auto-commit or manual merge?
|
||||||
|
4. **Git workflow:** Auto-push changes or leave for user review?
|
||||||
|
5. **Template selection:** Single template2 or multiple templates for different host types?
|
||||||
|
6. **Networking:** Always DHCP initially, or support static IP from start?
|
||||||
|
7. **Error recovery:** What happens if bootstrap fails? Manual intervention or retry?
|
||||||
|
|
||||||
|
## Implementation Order
|
||||||
|
|
||||||
|
Recommended sequence:
|
||||||
|
1. Phase 1: Parameterize OpenTofu (foundation for testing)
|
||||||
|
2. Phase 3: Bootstrap mechanism (core automation)
|
||||||
|
3. Phase 2: Config generator (automate the boilerplate)
|
||||||
|
4. Phase 4: Secrets (solves biggest chicken-and-egg)
|
||||||
|
5. Phase 5: DNS (nice-to-have automation)
|
||||||
|
6. Phase 6: Integration script (ties it all together)
|
||||||
|
7. Phase 7: Testing & docs
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
When complete, creating a new host should:
|
||||||
|
- Take < 5 minutes of human time
|
||||||
|
- Require minimal user input (hostname, IP, basic specs)
|
||||||
|
- Result in a fully configured, secret-enabled, DNS-registered host
|
||||||
|
- Be reproducible and documented
|
||||||
|
- Handle common errors gracefully
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Keep incremental commits at each phase
|
||||||
|
- Test each phase independently before moving to next
|
||||||
|
- Maintain backward compatibility with manual workflow
|
||||||
|
- Document any manual steps that can't be automated
|
||||||
35
docs/plans/completed/bootstrap-cache.md
Normal file
35
docs/plans/completed/bootstrap-cache.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Plan: Configure Template2 to Use Nix Cache
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
New VMs bootstrapped from template2 don't use our local nix cache (nix-cache.home.2rjus.net) during the initial `nixos-rebuild boot`. This means the first build downloads everything from cache.nixos.org, which is slower and uses more bandwidth.
|
||||||
|
|
||||||
|
## Solution
|
||||||
|
|
||||||
|
Update the template2 base image to include the nix cache configuration, so new VMs immediately benefit from cached builds during bootstrap.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
1. Add nix cache configuration to `hosts/template2/configuration.nix`:
|
||||||
|
```nix
|
||||||
|
nix.settings = {
|
||||||
|
substituters = [ "https://nix-cache.home.2rjus.net" "https://cache.nixos.org" ];
|
||||||
|
trusted-public-keys = [
|
||||||
|
"nix-cache.home.2rjus.net:..." # Add the cache's public key
|
||||||
|
"cache.nixos.org-1:..."
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Rebuild and redeploy the Proxmox template:
|
||||||
|
```bash
|
||||||
|
nix develop -c ansible-playbook -i playbooks/inventory.ini playbooks/build-and-deploy-template.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Update `default_template_name` in `terraform/variables.tf` if the template name changed
|
||||||
|
|
||||||
|
## Benefits
|
||||||
|
|
||||||
|
- Faster VM bootstrap times
|
||||||
|
- Reduced bandwidth to external cache
|
||||||
|
- Most derivations will already be cached from other hosts
|
||||||
72
docs/plans/completed/cert-monitoring.md
Normal file
72
docs/plans/completed/cert-monitoring.md
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# Certificate Monitoring Plan
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This document describes the removal of labmon certificate monitoring and outlines future needs for certificate monitoring in the homelab.
|
||||||
|
|
||||||
|
## What Was Removed
|
||||||
|
|
||||||
|
### labmon Service
|
||||||
|
|
||||||
|
The `labmon` service was a custom Go application that provided:
|
||||||
|
|
||||||
|
1. **StepMonitor**: Monitoring for step-ca (Smallstep CA) certificate provisioning and health
|
||||||
|
2. **TLSConnectionMonitor**: Periodic TLS connection checks to verify certificate validity and expiration
|
||||||
|
|
||||||
|
The service exposed Prometheus metrics at `:9969` including:
|
||||||
|
- `labmon_tlsconmon_certificate_seconds_left` - Time until certificate expiration
|
||||||
|
- `labmon_tlsconmon_certificate_check_error` - Whether the TLS check failed
|
||||||
|
- `labmon_stepmon_certificate_seconds_left` - Step-CA internal certificate expiration
|
||||||
|
|
||||||
|
### Affected Files
|
||||||
|
|
||||||
|
- `hosts/monitoring01/configuration.nix` - Removed labmon configuration block
|
||||||
|
- `services/monitoring/prometheus.nix` - Removed labmon scrape target
|
||||||
|
- `services/monitoring/rules.yml` - Removed `certificate_rules` alert group
|
||||||
|
- `services/monitoring/alloy.nix` - Deleted (was only used for labmon profiling)
|
||||||
|
- `services/monitoring/default.nix` - Removed alloy.nix import
|
||||||
|
|
||||||
|
### Removed Alerts
|
||||||
|
|
||||||
|
- `certificate_expiring_soon` - Warned when any monitored TLS cert had < 24h validity
|
||||||
|
- `step_ca_serving_cert_expiring` - Critical alert for step-ca's own serving certificate
|
||||||
|
- `certificate_check_error` - Warned when TLS connection check failed
|
||||||
|
- `step_ca_certificate_expiring` - Critical alert for step-ca issued certificates
|
||||||
|
|
||||||
|
## Why It Was Removed
|
||||||
|
|
||||||
|
1. **step-ca decommissioned**: The primary monitoring target (step-ca) is no longer in use
|
||||||
|
2. **Outdated codebase**: labmon was a custom tool that required maintenance
|
||||||
|
3. **Limited value**: With ACME auto-renewal, certificates should renew automatically
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
ACME certificates are now issued by OpenBao PKI at `vault.home.2rjus.net:8200`. The ACME protocol handles automatic renewal, and certificates are typically renewed well before expiration.
|
||||||
|
|
||||||
|
## Future Needs
|
||||||
|
|
||||||
|
While ACME handles renewal automatically, we should consider monitoring for:
|
||||||
|
|
||||||
|
1. **ACME renewal failures**: Alert when a certificate fails to renew
|
||||||
|
- Could monitor ACME client logs (via Loki queries)
|
||||||
|
- Could check certificate file modification times
|
||||||
|
|
||||||
|
2. **Certificate expiration as backup**: Even with auto-renewal, a last-resort alert for certificates approaching expiration would catch renewal failures
|
||||||
|
|
||||||
|
3. **Certificate transparency**: Monitor for unexpected certificate issuance
|
||||||
|
|
||||||
|
### Potential Solutions
|
||||||
|
|
||||||
|
1. **Prometheus blackbox_exporter**: Can probe TLS endpoints and export certificate expiration metrics
|
||||||
|
- `probe_ssl_earliest_cert_expiry` metric
|
||||||
|
- Already a standard tool, well-maintained
|
||||||
|
|
||||||
|
2. **Custom Loki alerting**: Query ACME service logs for renewal failures
|
||||||
|
- Works with existing infrastructure
|
||||||
|
- No additional services needed
|
||||||
|
|
||||||
|
3. **Node-exporter textfile collector**: Script that checks local certificate files and writes expiration metrics
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
**Not yet implemented.** This document serves as a placeholder for future work on certificate monitoring.
|
||||||
61
docs/plans/completed/dns-automation.md
Normal file
61
docs/plans/completed/dns-automation.md
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# DNS Automation
|
||||||
|
|
||||||
|
**Status:** Completed (2026-02-04)
|
||||||
|
|
||||||
|
**Goal:** Automatically generate DNS entries from host configurations
|
||||||
|
|
||||||
|
**Approach:** Leverage Nix to generate zone file entries from flake host configurations
|
||||||
|
|
||||||
|
Since most hosts use static IPs defined in their NixOS configurations, we can extract this information and automatically generate A records. This keeps DNS in sync with the actual host configs.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
- [x] Add optional CNAME field to host configurations
|
||||||
|
- [x] Added `homelab.dns.cnames` option in `modules/homelab/dns.nix`
|
||||||
|
- [x] Added `homelab.dns.enable` to allow opting out (defaults to true)
|
||||||
|
- [x] Documented in CLAUDE.md
|
||||||
|
- [x] Create Nix function to extract DNS records from all hosts
|
||||||
|
- [x] Created `lib/dns-zone.nix` with extraction functions
|
||||||
|
- [x] Parses each host's `networking.hostName` and `systemd.network.networks` IP configuration
|
||||||
|
- [x] Collects CNAMEs from `homelab.dns.cnames`
|
||||||
|
- [x] Filters out VPN interfaces (wg*, tun*, tap*, vti*)
|
||||||
|
- [x] Generates complete zone file with A and CNAME records
|
||||||
|
- [x] Integrate auto-generated records into zone files
|
||||||
|
- [x] External hosts separated to `services/ns/external-hosts.nix`
|
||||||
|
- [x] Zone includes comments showing which records are auto-generated vs external
|
||||||
|
- [x] Update zone file serial number automatically
|
||||||
|
- [x] Uses `self.sourceInfo.lastModified` (git commit timestamp)
|
||||||
|
- [x] Test zone file validity after generation
|
||||||
|
- [x] NSD validates zone at build time via `nsd-checkzone`
|
||||||
|
- [x] Deploy process documented
|
||||||
|
- [x] Merge to master, run auto-upgrade on ns1/ns2
|
||||||
|
|
||||||
|
## Files Created/Modified
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `modules/homelab/dns.nix` | Defines `homelab.dns.*` options |
|
||||||
|
| `modules/homelab/default.nix` | Module import hub |
|
||||||
|
| `lib/dns-zone.nix` | Zone generation functions |
|
||||||
|
| `services/ns/external-hosts.nix` | Non-flake host records |
|
||||||
|
| `services/ns/master-authorative.nix` | Uses generated zone |
|
||||||
|
| `services/ns/secondary-authorative.nix` | Uses generated zone |
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
View generated zone:
|
||||||
|
```bash
|
||||||
|
nix eval .#nixosConfigurations.ns1.config.services.nsd.zones.'"home.2rjus.net"'.data --raw
|
||||||
|
```
|
||||||
|
|
||||||
|
Add CNAMEs to a host:
|
||||||
|
```nix
|
||||||
|
homelab.dns.cnames = [ "alias1" "alias2" ];
|
||||||
|
```
|
||||||
|
|
||||||
|
Exclude a host from DNS:
|
||||||
|
```nix
|
||||||
|
homelab.dns.enable = false;
|
||||||
|
```
|
||||||
|
|
||||||
|
Add non-flake hosts: Edit `services/ns/external-hosts.nix`
|
||||||
46
docs/plans/completed/garage-s3-storage.md
Normal file
46
docs/plans/completed/garage-s3-storage.md
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# Garage S3 Storage Server
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Deploy a Garage instance for self-hosted S3-compatible object storage.
|
||||||
|
|
||||||
|
## Garage Basics
|
||||||
|
|
||||||
|
- S3-compatible distributed object storage designed for self-hosting
|
||||||
|
- Supports per-key, per-bucket permissions (read/write/owner)
|
||||||
|
- Keys without explicit grants have no access
|
||||||
|
|
||||||
|
## NixOS Module
|
||||||
|
|
||||||
|
Available as `services.garage` with these key options:
|
||||||
|
|
||||||
|
- `services.garage.enable` - Enable the service
|
||||||
|
- `services.garage.package` - Must be set explicitly
|
||||||
|
- `services.garage.settings` - Freeform TOML config (replication mode, ports, RPC, etc.)
|
||||||
|
- `services.garage.settings.metadata_dir` - Metadata storage (SSD recommended)
|
||||||
|
- `services.garage.settings.data_dir` - Data block storage (supports multiple dirs since v0.9)
|
||||||
|
- `services.garage.environmentFile` - For secrets like `GARAGE_RPC_SECRET`
|
||||||
|
- `services.garage.logLevel` - error/warn/info/debug/trace
|
||||||
|
|
||||||
|
The NixOS module only manages the server daemon. Buckets and keys are managed externally.
|
||||||
|
|
||||||
|
## Bucket/Key Management
|
||||||
|
|
||||||
|
No declarative NixOS options for buckets or keys. Two options:
|
||||||
|
|
||||||
|
1. **Terraform provider** - `jkossis/terraform-provider-garage` manages buckets, keys, and permissions via the Garage Admin API v2. Could live in `terraform/garage/` similar to `terraform/vault/`.
|
||||||
|
2. **CLI** - `garage key create`, `garage bucket create`, `garage bucket allow`
|
||||||
|
|
||||||
|
## Integration Ideas
|
||||||
|
|
||||||
|
- Store Garage API keys in Vault, fetch via `vault.secrets` on consuming hosts
|
||||||
|
- Terraform manages both Vault secrets and Garage buckets/keys
|
||||||
|
- Enable admin API with token for Terraform provider access
|
||||||
|
- Add Prometheus metrics scraping (Garage exposes metrics endpoint)
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- Single-node or multi-node replication?
|
||||||
|
- Which host to deploy on?
|
||||||
|
- What to store? (backups, media, app data)
|
||||||
|
- Expose via HTTP proxy or direct S3 API only?
|
||||||
23
docs/plans/completed/host-cleanup.md
Normal file
23
docs/plans/completed/host-cleanup.md
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# Host Cleanup
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Remove decommissioned/unused host configurations that are no longer reachable on the network.
|
||||||
|
|
||||||
|
## Hosts to review
|
||||||
|
|
||||||
|
The following hosts return "no route to host" when scraped by Prometheus and are likely no longer needed:
|
||||||
|
|
||||||
|
- `media1` (10.69.12.82)
|
||||||
|
- `ns3` (10.69.13.7)
|
||||||
|
- `ns4` (10.69.13.8)
|
||||||
|
- `nixos-test1` (10.69.13.10)
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
1. Confirm each host is truly decommissioned (not just temporarily powered off)
|
||||||
|
2. Remove host directory from `hosts/`
|
||||||
|
3. Remove `nixosConfigurations` entry from `flake.nix`
|
||||||
|
4. Remove host's age key from `.sops.yaml`
|
||||||
|
5. Remove per-host secrets from `secrets/<hostname>/` if any
|
||||||
|
6. Verify DNS zone and Prometheus targets no longer include the removed hosts after rebuild
|
||||||
128
docs/plans/completed/monitoring-gaps.md
Normal file
128
docs/plans/completed/monitoring-gaps.md
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
# Monitoring Gaps Audit
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Audit of services running in the homelab that lack monitoring coverage, either missing Prometheus scrape targets, alerting rules, or both.
|
||||||
|
|
||||||
|
## Services with No Monitoring
|
||||||
|
|
||||||
|
### PostgreSQL (`pgdb1`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** A database outage would go completely unnoticed by Prometheus
|
||||||
|
- **Recommendation:** Enable `services.prometheus.exporters.postgres` (available in nixpkgs). This exposes connection counts, query throughput, replication lag, table/index stats, and more. Add alerts for at least `postgres_down` (systemd unit state) and connection pool exhaustion.
|
||||||
|
|
||||||
|
### Authelia (`auth01`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** The authentication gateway being down blocks access to all proxied services
|
||||||
|
- **Recommendation:** Authelia exposes Prometheus metrics natively at `/metrics`. Add a scrape target and at minimum an `authelia_down` systemd unit state alert.
|
||||||
|
|
||||||
|
### LLDAP (`auth01`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** LLDAP is a dependency of Authelia -- if LDAP is down, authentication breaks even if Authelia is running
|
||||||
|
- **Recommendation:** Add an `lldap_down` systemd unit state alert. LLDAP does not expose Prometheus metrics natively, so systemd unit monitoring via node-exporter may be sufficient.
|
||||||
|
|
||||||
|
### Vault / OpenBao (`vault01`)
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** Secrets management service failures go undetected
|
||||||
|
- **Recommendation:** OpenBao supports Prometheus telemetry output natively. Add a scrape target for the telemetry endpoint and alerts for `vault_down` (systemd unit) and seal status.
|
||||||
|
|
||||||
|
### Gitea Actions Runner
|
||||||
|
|
||||||
|
- **Current state:** No scrape targets, no alert rules
|
||||||
|
- **Risk:** CI/CD failures go undetected
|
||||||
|
- **Recommendation:** Add at minimum a systemd unit state alert. The runner itself has limited metrics exposure.
|
||||||
|
|
||||||
|
## Services with Partial Monitoring
|
||||||
|
|
||||||
|
### Jellyfin (`jelly01`)
|
||||||
|
|
||||||
|
- **Current state:** Has scrape targets (port 8096), metrics are being collected, but zero alert rules
|
||||||
|
- **Metrics available:** 184 metrics, all at the .NET runtime / ASP.NET Core level. No Jellyfin-specific metrics (active streams, library size, transcoding sessions). Key useful metrics:
|
||||||
|
- `microsoft_aspnetcore_hosting_failed_requests` - rate of HTTP errors
|
||||||
|
- `microsoft_aspnetcore_hosting_current_requests` - in-flight requests
|
||||||
|
- `process_working_set_bytes` - memory usage (~256 MB currently)
|
||||||
|
- `dotnet_gc_pause_ratio` - GC pressure
|
||||||
|
- `up{job="jellyfin"}` - basic availability
|
||||||
|
- **Recommendation:** Add a `jellyfin_down` alert using either `up{job="jellyfin"} == 0` or systemd unit state. Consider alerting on sustained `failed_requests` rate increase.
|
||||||
|
|
||||||
|
### NATS (`nats1`)
|
||||||
|
|
||||||
|
- **Current state:** Has a `nats_down` alert (systemd unit state via node-exporter), but no NATS-specific metrics
|
||||||
|
- **Metrics available:** NATS has a built-in `/metrics` endpoint exposing connection counts, message throughput, JetStream consumer lag, and more
|
||||||
|
- **Recommendation:** Add a scrape target for the NATS metrics endpoint. Consider alerts for connection count spikes, slow consumers, and JetStream storage usage.
|
||||||
|
|
||||||
|
### DNS - Unbound (`ns1`, `ns2`)
|
||||||
|
|
||||||
|
- **Current state:** Has `unbound_down` alert (systemd unit state), but no DNS query metrics
|
||||||
|
- **Available in nixpkgs:** `services.prometheus.exporters.unbound.enable` (package: `prometheus-unbound-exporter` v0.5.0). Exposes query counts, cache hit ratios, response types (SERVFAIL, NXDOMAIN), upstream latency.
|
||||||
|
- **Recommendation:** Enable the unbound exporter on ns1/ns2. Add alerts for cache hit ratio drops and SERVFAIL rate spikes.
|
||||||
|
|
||||||
|
### DNS - NSD (`ns1`, `ns2`)
|
||||||
|
|
||||||
|
- **Current state:** Has `nsd_down` alert (systemd unit state), no NSD-specific metrics
|
||||||
|
- **Available in nixpkgs:** Nothing. No exporter package or NixOS module. Community `nsd_exporter` exists but is not packaged.
|
||||||
|
- **Recommendation:** The existing systemd unit alert is likely sufficient. NSD is a simple authoritative-only server with limited operational metrics. Not worth packaging a custom exporter for now.
|
||||||
|
|
||||||
|
## Existing Monitoring (for reference)
|
||||||
|
|
||||||
|
These services have adequate alerting and/or scrape targets:
|
||||||
|
|
||||||
|
| Service | Scrape Targets | Alert Rules |
|
||||||
|
|---|---|---|
|
||||||
|
| Monitoring stack (Prometheus, Grafana, Loki, Tempo, Pyroscope) | Yes | 7 alerts |
|
||||||
|
| Home Assistant (+ Zigbee2MQTT, Mosquitto) | Yes (port 8123) | 3 alerts |
|
||||||
|
| HTTP Proxy (Caddy) | Yes (port 80) | 3 alerts |
|
||||||
|
| Nix Cache (Harmonia, build-flakes) | Via Caddy | 4 alerts |
|
||||||
|
| CA (step-ca) | Yes (port 9000) | 4 certificate alerts |
|
||||||
|
|
||||||
|
## Per-Service Resource Metrics (systemd-exporter)
|
||||||
|
|
||||||
|
### Current State
|
||||||
|
|
||||||
|
No per-service CPU, memory, or IO metrics are collected. The existing node-exporter systemd collector only provides unit state (active/inactive/failed), socket stats, and timer triggers. While systemd tracks per-unit resource usage via cgroups internally (visible in `systemctl status` and `systemd-cgtop`), this data is not exported to Prometheus.
|
||||||
|
|
||||||
|
### Available Solution
|
||||||
|
|
||||||
|
The `prometheus-systemd-exporter` package (v0.7.0) is available in nixpkgs with a ready-made NixOS module:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.prometheus.exporters.systemd.enable = true;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Options:** `enable`, `port`, `extraFlags`, `user`, `group`
|
||||||
|
|
||||||
|
This exporter reads cgroup data and exposes per-unit metrics including:
|
||||||
|
- CPU seconds consumed per service
|
||||||
|
- Memory usage per service
|
||||||
|
- Task/process counts per service
|
||||||
|
- Restart counts
|
||||||
|
- IO usage
|
||||||
|
|
||||||
|
### Recommendation
|
||||||
|
|
||||||
|
Enable on all hosts via the shared `system/` config (same pattern as node-exporter). Add a corresponding scrape job on monitoring01. This would give visibility into resource consumption per service across the fleet, useful for capacity planning and diagnosing noisy-neighbor issues on shared hosts.
|
||||||
|
|
||||||
|
## Suggested Priority
|
||||||
|
|
||||||
|
1. **PostgreSQL** - Critical infrastructure, easy to add with existing nixpkgs module
|
||||||
|
2. **Authelia + LLDAP** - Auth outage affects all proxied services
|
||||||
|
3. **Unbound exporter** - Ready-to-go NixOS module, just needs enabling
|
||||||
|
4. **Jellyfin alerts** - Metrics already collected, just needs alert rules
|
||||||
|
5. **NATS metrics** - Built-in endpoint, just needs a scrape target
|
||||||
|
6. **Vault/OpenBao** - Native telemetry support
|
||||||
|
7. **Actions Runner** - Lower priority, basic systemd alert sufficient
|
||||||
|
|
||||||
|
## Node-Exporter Targets Currently Down
|
||||||
|
|
||||||
|
Noted during the audit -- these node-exporter targets are failing:
|
||||||
|
|
||||||
|
- `nixos-test1.home.2rjus.net:9100` - no route to host
|
||||||
|
- `media1.home.2rjus.net:9100` - no route to host
|
||||||
|
- `ns3.home.2rjus.net:9100` - no route to host
|
||||||
|
- `ns4.home.2rjus.net:9100` - no route to host
|
||||||
|
|
||||||
|
These may be decommissioned or powered-off hosts that should be removed from the scrape config.
|
||||||
156
docs/plans/completed/monitoring-migration-victoriametrics.md
Normal file
156
docs/plans/completed/monitoring-migration-victoriametrics.md
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
# Monitoring Stack Migration to VictoriaMetrics
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Migrate from Prometheus to VictoriaMetrics on a new host (monitoring02) to gain better compression
|
||||||
|
and longer retention. Run in parallel with monitoring01 until validated, then switch over using
|
||||||
|
a `monitoring` CNAME for seamless transition.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
**monitoring02** (10.69.13.24) - **PRIMARY**:
|
||||||
|
- 4 CPU cores, 8GB RAM, 60GB disk
|
||||||
|
- VictoriaMetrics with 3-month retention
|
||||||
|
- vmalert with alerting enabled (routes to local Alertmanager)
|
||||||
|
- Alertmanager -> alerttonotify -> NATS notification pipeline
|
||||||
|
- Grafana with Kanidm OIDC (`grafana.home.2rjus.net`)
|
||||||
|
- Loki (log aggregation)
|
||||||
|
- CNAMEs: monitoring, alertmanager, grafana, grafana-test, metrics, vmalert, loki
|
||||||
|
|
||||||
|
**monitoring01** (10.69.13.13) - **SHUT DOWN**:
|
||||||
|
- No longer running, pending decommission
|
||||||
|
|
||||||
|
## Decision: VictoriaMetrics
|
||||||
|
|
||||||
|
Per `docs/plans/long-term-metrics-storage.md`, VictoriaMetrics is the recommended starting point:
|
||||||
|
- Single binary replacement for Prometheus
|
||||||
|
- 5-10x better compression (30 days could become 180+ days in same space)
|
||||||
|
- Same PromQL query language (Grafana dashboards work unchanged)
|
||||||
|
- Same scrape config format (existing auto-generated configs work)
|
||||||
|
|
||||||
|
If multi-year retention with downsampling becomes necessary later, Thanos can be evaluated.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐
|
||||||
|
│ monitoring02 │
|
||||||
|
│ VictoriaMetrics│
|
||||||
|
│ + Grafana │
|
||||||
|
monitoring │ + Loki │
|
||||||
|
CNAME ──────────│ + Alertmanager │
|
||||||
|
│ (vmalert) │
|
||||||
|
└─────────────────┘
|
||||||
|
▲
|
||||||
|
│ scrapes
|
||||||
|
┌───────────────┼───────────────┐
|
||||||
|
│ │ │
|
||||||
|
┌────┴────┐ ┌─────┴────┐ ┌─────┴────┐
|
||||||
|
│ ns1 │ │ ha1 │ │ ... │
|
||||||
|
│ :9100 │ │ :9100 │ │ :9100 │
|
||||||
|
└─────────┘ └──────────┘ └──────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
### Phase 1: Create monitoring02 Host [COMPLETE]
|
||||||
|
|
||||||
|
Host created and deployed at 10.69.13.24 (prod tier) with:
|
||||||
|
- 4 CPU cores, 8GB RAM, 60GB disk
|
||||||
|
- Vault integration enabled
|
||||||
|
- NATS-based remote deployment enabled
|
||||||
|
- Grafana with Kanidm OIDC deployed as test instance (`grafana-test.home.2rjus.net`)
|
||||||
|
|
||||||
|
### Phase 2: Set Up VictoriaMetrics Stack [COMPLETE]
|
||||||
|
|
||||||
|
New service module at `services/victoriametrics/` for VictoriaMetrics + vmalert + Alertmanager.
|
||||||
|
Imported by monitoring02 alongside the existing Grafana service.
|
||||||
|
|
||||||
|
1. **VictoriaMetrics** (port 8428):
|
||||||
|
- `services.victoriametrics.enable = true`
|
||||||
|
- `retentionPeriod = "3"` (3 months)
|
||||||
|
- All scrape configs migrated from Prometheus (22 jobs including auto-generated)
|
||||||
|
- Static user override (DynamicUser disabled) for credential file access
|
||||||
|
- OpenBao token fetch service + 30min refresh timer
|
||||||
|
- Apiary bearer token via vault.secrets
|
||||||
|
|
||||||
|
2. **vmalert** for alerting rules:
|
||||||
|
- Points to VictoriaMetrics datasource at localhost:8428
|
||||||
|
- Reuses existing `services/monitoring/rules.yml` directly via `settings.rule`
|
||||||
|
- Notifier sends to local Alertmanager at localhost:9093
|
||||||
|
|
||||||
|
3. **Alertmanager** (port 9093):
|
||||||
|
- Same configuration as monitoring01 (alerttonotify webhook routing)
|
||||||
|
- alerttonotify imported on monitoring02, routes alerts via NATS
|
||||||
|
|
||||||
|
4. **Grafana** (port 3000):
|
||||||
|
- VictoriaMetrics datasource (localhost:8428) as default
|
||||||
|
- Loki datasource pointing to localhost:3100
|
||||||
|
|
||||||
|
5. **Loki** (port 3100):
|
||||||
|
- Same configuration as monitoring01 in standalone `services/loki/` module
|
||||||
|
- Grafana datasource updated to localhost:3100
|
||||||
|
|
||||||
|
**Note:** pve-exporter and pushgateway scrape targets are not included on monitoring02.
|
||||||
|
pve-exporter requires a local exporter instance; pushgateway is replaced by VictoriaMetrics
|
||||||
|
native push support.
|
||||||
|
|
||||||
|
### Phase 3: Parallel Operation [COMPLETE]
|
||||||
|
|
||||||
|
Ran both monitoring01 and monitoring02 simultaneously to validate data collection and dashboards.
|
||||||
|
|
||||||
|
### Phase 4: Add monitoring CNAME [COMPLETE]
|
||||||
|
|
||||||
|
Added CNAMEs to monitoring02: monitoring, alertmanager, grafana, metrics, vmalert, loki.
|
||||||
|
|
||||||
|
### Phase 5: Update References [COMPLETE]
|
||||||
|
|
||||||
|
- Moved alertmanager, grafana, prometheus CNAMEs from http-proxy to monitoring02
|
||||||
|
- Removed corresponding Caddy reverse proxy entries from http-proxy
|
||||||
|
- monitoring02 Caddy serves alertmanager, grafana, metrics, vmalert directly
|
||||||
|
|
||||||
|
### Phase 6: Enable Alerting [COMPLETE]
|
||||||
|
|
||||||
|
- Switched vmalert from blackhole mode to local Alertmanager
|
||||||
|
- alerttonotify service running on monitoring02 (NATS nkey from Vault)
|
||||||
|
- prometheus-metrics Vault policy added for OpenBao scraping
|
||||||
|
- Full alerting pipeline verified: vmalert -> Alertmanager -> alerttonotify -> NATS
|
||||||
|
|
||||||
|
### Phase 7: Cutover and Decommission [IN PROGRESS]
|
||||||
|
|
||||||
|
- monitoring01 shut down (2026-02-17)
|
||||||
|
- Vault AppRole moved from approle.tf to hosts-generated.tf with extra_policies support
|
||||||
|
|
||||||
|
**Remaining cleanup (separate branch):**
|
||||||
|
- [ ] Update `system/monitoring/logs.nix` - Promtail still points to monitoring01
|
||||||
|
- [ ] Update `hosts/template2/bootstrap.nix` - Bootstrap Loki URL still points to monitoring01
|
||||||
|
- [ ] Remove monitoring01 from flake.nix and host configuration
|
||||||
|
- [ ] Destroy monitoring01 VM in Proxmox
|
||||||
|
- [ ] Remove monitoring01 from terraform state
|
||||||
|
- [ ] Remove or archive `services/monitoring/` (Prometheus config)
|
||||||
|
|
||||||
|
## Completed
|
||||||
|
|
||||||
|
- 2026-02-08: Phase 1 - monitoring02 host created
|
||||||
|
- 2026-02-17: Phase 2 - VictoriaMetrics, vmalert, Alertmanager, Loki, Grafana configured
|
||||||
|
- 2026-02-17: Phase 6 - Alerting enabled, CNAMEs migrated, monitoring01 shut down
|
||||||
|
|
||||||
|
## VictoriaMetrics Service Configuration
|
||||||
|
|
||||||
|
Implemented in `services/victoriametrics/default.nix`. Key design decisions:
|
||||||
|
|
||||||
|
- **Static user**: VictoriaMetrics NixOS module uses `DynamicUser`, overridden with a static
|
||||||
|
`victoriametrics` user so vault.secrets and credential files work correctly
|
||||||
|
- **Shared rules**: vmalert reuses `services/monitoring/rules.yml` via `settings.rule` path
|
||||||
|
reference (no YAML-to-Nix conversion needed)
|
||||||
|
- **Scrape config reuse**: Uses the same `lib/monitoring.nix` functions and
|
||||||
|
`services/monitoring/external-targets.nix` as Prometheus for auto-generated targets
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- VictoriaMetrics uses port 8428 vs Prometheus 9090
|
||||||
|
- PromQL compatibility is excellent
|
||||||
|
- VictoriaMetrics native push replaces Pushgateway (remove from http-proxy if not needed)
|
||||||
|
- monitoring02 deployed via OpenTofu using `create-host` script
|
||||||
|
- Grafana dashboards defined declaratively via NixOS, not imported from monitoring01 state
|
||||||
|
- Tempo and Pyroscope deferred (not actively used; can be added later if needed)
|
||||||
135
docs/plans/completed/monitoring02-reboot-alert-investigation.md
Normal file
135
docs/plans/completed/monitoring02-reboot-alert-investigation.md
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
# monitoring02 Reboot Alert Investigation
|
||||||
|
|
||||||
|
**Date:** 2026-02-10
|
||||||
|
**Status:** Completed - False positive identified
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
A `host_reboot` alert fired for monitoring02 at 16:27:36 UTC. Investigation determined this was a **false positive** caused by NTP clock adjustments, not an actual reboot.
|
||||||
|
|
||||||
|
## Alert Details
|
||||||
|
|
||||||
|
- **Alert:** `host_reboot`
|
||||||
|
- **Rule:** `changes(node_boot_time_seconds[10m]) > 0`
|
||||||
|
- **Host:** monitoring02
|
||||||
|
- **Time:** 2026-02-10T16:27:36Z
|
||||||
|
|
||||||
|
## Investigation Findings
|
||||||
|
|
||||||
|
### Evidence Against Actual Reboot
|
||||||
|
|
||||||
|
1. **Uptime:** System had been up for ~40 hours (143,751 seconds) at time of alert
|
||||||
|
2. **Consistent BOOT_ID:** All logs showed the same systemd BOOT_ID (`fd26e7f3d86f4cd688d1b1d7af62f2ad`) from Feb 9 through the alert time
|
||||||
|
3. **No log gaps:** Logs were continuous - no shutdown/restart cycle visible
|
||||||
|
4. **Prometheus metrics:** `node_boot_time_seconds` showed a 1-second fluctuation, then returned to normal
|
||||||
|
|
||||||
|
### Root Cause: NTP Clock Adjustment
|
||||||
|
|
||||||
|
The `node_boot_time_seconds` metric fluctuated by 1 second due to how Linux calculates boot time:
|
||||||
|
|
||||||
|
```
|
||||||
|
btime = current_wall_clock_time - monotonic_uptime
|
||||||
|
```
|
||||||
|
|
||||||
|
When NTP adjusts the wall clock, `btime` shifts by the same amount. The `node_timex_*` metrics confirmed this:
|
||||||
|
|
||||||
|
| Metric | Value |
|
||||||
|
|--------|-------|
|
||||||
|
| `node_timex_maxerror_seconds` (max in 3h) | 1.02 seconds |
|
||||||
|
| `node_timex_maxerror_seconds` (max in 24h) | 2.05 seconds |
|
||||||
|
| `node_timex_sync_status` | 1 (synced) |
|
||||||
|
| Current `node_timex_offset_seconds` | ~9ms (normal) |
|
||||||
|
|
||||||
|
The kernel's estimated maximum clock error spiked to over 1 second, causing the boot time calculation to drift momentarily.
|
||||||
|
|
||||||
|
Additionally, `systemd-resolved` logged "Clock change detected. Flushing caches." at 16:26:53Z, corroborating the NTP adjustment.
|
||||||
|
|
||||||
|
## Current Time Sync Configuration
|
||||||
|
|
||||||
|
### NixOS Guests
|
||||||
|
- **NTP client:** systemd-timesyncd (NixOS default)
|
||||||
|
- **No explicit configuration** in the codebase
|
||||||
|
- Uses default NixOS NTP server pool
|
||||||
|
|
||||||
|
### Proxmox VMs
|
||||||
|
- **Clocksource:** `kvm-clock` (optimal for KVM VMs)
|
||||||
|
- **QEMU guest agent:** Enabled
|
||||||
|
- **No additional QEMU timing args** configured
|
||||||
|
|
||||||
|
## Potential Improvements
|
||||||
|
|
||||||
|
### 1. Improve Alert Rule (Recommended)
|
||||||
|
|
||||||
|
Add tolerance to filter out small NTP adjustments:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Current rule (triggers on any change)
|
||||||
|
expr: changes(node_boot_time_seconds[10m]) > 0
|
||||||
|
|
||||||
|
# Improved rule (requires >60 second shift)
|
||||||
|
expr: changes(node_boot_time_seconds[10m]) > 0 and abs(delta(node_boot_time_seconds[10m])) > 60
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Switch to Chrony (Optional)
|
||||||
|
|
||||||
|
Chrony handles time adjustments more gracefully than systemd-timesyncd:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# In common/vm/qemu-guest.nix
|
||||||
|
{
|
||||||
|
services.qemuGuest.enable = true;
|
||||||
|
|
||||||
|
services.timesyncd.enable = false;
|
||||||
|
services.chrony = {
|
||||||
|
enable = true;
|
||||||
|
extraConfig = ''
|
||||||
|
makestep 1 3
|
||||||
|
rtcsync
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Add QEMU Timing Args (Optional)
|
||||||
|
|
||||||
|
In `terraform/vms.tf`:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
args = "-global kvm-pit.lost_tick_policy=delay -rtc driftfix=slew"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Local NTP Server (Optional)
|
||||||
|
|
||||||
|
Running a local NTP server (e.g., on ns1/ns2) would reduce latency and improve sync stability across all hosts.
|
||||||
|
|
||||||
|
## Monitoring NTP Health
|
||||||
|
|
||||||
|
The `node_timex_*` metrics from node_exporter provide visibility into NTP health:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Clock offset from reference
|
||||||
|
node_timex_offset_seconds
|
||||||
|
|
||||||
|
# Sync status (1 = synced)
|
||||||
|
node_timex_sync_status
|
||||||
|
|
||||||
|
# Maximum estimated error - useful for alerting
|
||||||
|
node_timex_maxerror_seconds
|
||||||
|
```
|
||||||
|
|
||||||
|
A potential alert for NTP issues:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- alert: ntp_clock_drift
|
||||||
|
expr: node_timex_maxerror_seconds > 1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "High clock drift on {{ $labels.hostname }}"
|
||||||
|
description: "NTP max error is {{ $value }}s on {{ $labels.hostname }}"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
No action required for the alert itself - the system was healthy. Consider implementing the improved alert rule to prevent future false positives from NTP adjustments.
|
||||||
371
docs/plans/completed/nats-deploy-service.md
Normal file
371
docs/plans/completed/nats-deploy-service.md
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
# NATS-Based Deployment Service
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Create a message-based deployment system that allows triggering NixOS configuration updates on-demand, rather than waiting for the daily auto-upgrade timer. This enables faster iteration when testing changes and immediate fleet-wide deployments.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **On-demand deployment** - Trigger config updates immediately via NATS message
|
||||||
|
2. **Targeted deployment** - Deploy to specific hosts or all hosts
|
||||||
|
3. **Branch/revision support** - Test feature branches before merging to master
|
||||||
|
4. **MCP integration** - Allow Claude Code to trigger deployments during development
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- **Auto-upgrade**: All hosts run `nixos-upgrade.service` daily, pulling from master
|
||||||
|
- **Manual testing**: `nixos-rebuild-test <action> <branch>` helper exists on all hosts
|
||||||
|
- **NATS**: Running on nats1 with JetStream enabled, using NKey authentication
|
||||||
|
- **Accounts**: ADMIN (system) and HOMELAB (user workloads with JetStream)
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────┐ ┌─────────────┐
|
||||||
|
│ MCP Tool │ deploy.test.> │ Admin CLI │ deploy.test.> + deploy.prod.>
|
||||||
|
│ (claude) │────────────┐ ┌─────│ (torjus) │
|
||||||
|
└─────────────┘ │ │ └─────────────┘
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ nats1 │
|
||||||
|
│ (authz) │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
┌─────────────────┼─────────────────┐
|
||||||
|
│ │ │
|
||||||
|
▼ ▼ ▼
|
||||||
|
┌──────────┐ ┌──────────┐ ┌──────────┐
|
||||||
|
│ template1│ │ ns1 │ │ ha1 │
|
||||||
|
│ tier=test│ │ tier=prod│ │ tier=prod│
|
||||||
|
└──────────┘ └──────────┘ └──────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Repository Structure
|
||||||
|
|
||||||
|
The project lives in a **separate repository** (e.g., `homelab-deploy`) containing:
|
||||||
|
|
||||||
|
```
|
||||||
|
homelab-deploy/
|
||||||
|
├── flake.nix # Nix flake with Go package + NixOS module
|
||||||
|
├── go.mod
|
||||||
|
├── go.sum
|
||||||
|
├── cmd/
|
||||||
|
│ └── homelab-deploy/
|
||||||
|
│ └── main.go # CLI entrypoint with subcommands
|
||||||
|
├── internal/
|
||||||
|
│ ├── listener/ # Listener mode logic
|
||||||
|
│ ├── mcp/ # MCP server mode logic
|
||||||
|
│ └── deploy/ # Shared deployment logic
|
||||||
|
└── nixos/
|
||||||
|
└── module.nix # NixOS module for listener service
|
||||||
|
```
|
||||||
|
|
||||||
|
This repository (nixos-servers) imports the homelab-deploy flake as an input and uses its NixOS module.
|
||||||
|
|
||||||
|
## Single Binary with Subcommands
|
||||||
|
|
||||||
|
The `homelab-deploy` binary supports multiple modes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run as listener on a host (systemd service)
|
||||||
|
homelab-deploy listener --hostname ns1 --nats-url nats://nats1:4222
|
||||||
|
|
||||||
|
# Run as MCP server (for Claude Code)
|
||||||
|
homelab-deploy mcp --nats-url nats://nats1:4222
|
||||||
|
|
||||||
|
# CLI commands for manual use
|
||||||
|
homelab-deploy deploy ns1 --branch feature-x --action switch # single host
|
||||||
|
homelab-deploy deploy --tier test --all --action boot # all test hosts
|
||||||
|
homelab-deploy deploy --tier prod --all --action boot # all prod hosts (admin only)
|
||||||
|
homelab-deploy deploy --tier prod --role dns --action switch # all prod dns hosts
|
||||||
|
homelab-deploy status
|
||||||
|
```
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### Listener Mode
|
||||||
|
|
||||||
|
A systemd service on each host that:
|
||||||
|
- Subscribes to multiple subjects for targeted and group deployments
|
||||||
|
- Validates incoming messages (revision, action)
|
||||||
|
- Executes `nixos-rebuild` with specified parameters
|
||||||
|
- Reports status back via NATS
|
||||||
|
|
||||||
|
**Subject structure:**
|
||||||
|
```
|
||||||
|
deploy.<tier>.<hostname> # specific host (e.g., deploy.prod.ns1)
|
||||||
|
deploy.<tier>.all # all hosts in tier (e.g., deploy.test.all)
|
||||||
|
deploy.<tier>.role.<role> # all hosts with role in tier (e.g., deploy.prod.role.dns)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Listener subscriptions** (based on `homelab.host` config):
|
||||||
|
- `deploy.<tier>.<hostname>` - direct messages to this host
|
||||||
|
- `deploy.<tier>.all` - broadcast to all hosts in tier
|
||||||
|
- `deploy.<tier>.role.<role>` - broadcast to hosts with matching role (if role is set)
|
||||||
|
|
||||||
|
Example: ns1 with `tier=prod, role=dns` subscribes to:
|
||||||
|
- `deploy.prod.ns1`
|
||||||
|
- `deploy.prod.all`
|
||||||
|
- `deploy.prod.role.dns`
|
||||||
|
|
||||||
|
**NixOS module configuration:**
|
||||||
|
```nix
|
||||||
|
services.homelab-deploy.listener = {
|
||||||
|
enable = true;
|
||||||
|
timeout = 600; # seconds, default 10 minutes
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
The listener reads tier and role from `config.homelab.host` (see Host Metadata below).
|
||||||
|
|
||||||
|
**Request message format:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"action": "switch" | "boot" | "test" | "dry-activate",
|
||||||
|
"revision": "master" | "feature-branch" | "abc123...",
|
||||||
|
"reply_to": "deploy.responses.<request-id>"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response message format:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "accepted" | "rejected" | "started" | "completed" | "failed",
|
||||||
|
"error": "invalid_revision" | "already_running" | "build_failed" | null,
|
||||||
|
"message": "human-readable details"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Request/Reply flow:**
|
||||||
|
1. MCP/CLI sends deploy request with unique `reply_to` subject
|
||||||
|
2. Listener validates request (e.g., `git ls-remote` to check revision exists)
|
||||||
|
3. Listener sends immediate response:
|
||||||
|
- `{"status": "rejected", "error": "invalid_revision", "message": "branch 'foo' not found"}`, or
|
||||||
|
- `{"status": "started", "message": "starting nixos-rebuild switch"}`
|
||||||
|
4. If started, listener runs nixos-rebuild
|
||||||
|
5. Listener sends final response:
|
||||||
|
- `{"status": "completed", "message": "successfully switched to generation 42"}`, or
|
||||||
|
- `{"status": "failed", "error": "build_failed", "message": "nixos-rebuild exited with code 1"}`
|
||||||
|
|
||||||
|
This provides immediate feedback on validation errors (bad revision, already running) without waiting for the build to fail.
|
||||||
|
|
||||||
|
### MCP Mode
|
||||||
|
|
||||||
|
Runs as an MCP server providing tools for Claude Code.
|
||||||
|
|
||||||
|
**Tools:**
|
||||||
|
| Tool | Description | Tier Access |
|
||||||
|
|------|-------------|-------------|
|
||||||
|
| `deploy` | Deploy to test hosts (individual, all, or by role) | test only |
|
||||||
|
| `deploy_admin` | Deploy to any host (requires `--enable-admin` flag) | test + prod |
|
||||||
|
| `deploy_status` | Check deployment status/history | n/a |
|
||||||
|
| `list_hosts` | List available deployment targets | n/a |
|
||||||
|
|
||||||
|
**CLI flags:**
|
||||||
|
```bash
|
||||||
|
# Default: only test-tier deployments available
|
||||||
|
homelab-deploy mcp --nats-url nats://nats1:4222
|
||||||
|
|
||||||
|
# Enable admin tool (requires admin NKey to be configured)
|
||||||
|
homelab-deploy mcp --nats-url nats://nats1:4222 --enable-admin --admin-nkey-file /path/to/admin.nkey
|
||||||
|
```
|
||||||
|
|
||||||
|
**Security layers:**
|
||||||
|
1. **MCP flag**: `deploy_admin` tool only exposed when `--enable-admin` is passed
|
||||||
|
2. **NATS authz**: Even if tool is exposed, NATS rejects publishes without valid admin NKey
|
||||||
|
3. **Claude Code permissions**: Can set `mcp__homelab-deploy__deploy_admin` to `ask` mode for confirmation popup
|
||||||
|
|
||||||
|
By default, the MCP only loads test-tier credentials and exposes the `deploy` tool. Claude can:
|
||||||
|
- Deploy to individual test hosts
|
||||||
|
- Deploy to all test hosts at once (`deploy.test.all`)
|
||||||
|
- Deploy to test hosts by role (`deploy.test.role.<role>`)
|
||||||
|
|
||||||
|
### Tiered Permissions
|
||||||
|
|
||||||
|
Authorization is enforced at the NATS layer using subject-based permissions. Different deployer credentials have different publish rights:
|
||||||
|
|
||||||
|
**NATS user configuration (on nats1):**
|
||||||
|
```nix
|
||||||
|
accounts = {
|
||||||
|
HOMELAB = {
|
||||||
|
users = [
|
||||||
|
# MCP/Claude - test tier only
|
||||||
|
{
|
||||||
|
nkey = "UABC..."; # mcp-deployer
|
||||||
|
permissions = {
|
||||||
|
publish = [ "deploy.test.>" ];
|
||||||
|
subscribe = [ "deploy.responses.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Admin - full access to all tiers
|
||||||
|
{
|
||||||
|
nkey = "UXYZ..."; # admin-deployer
|
||||||
|
permissions = {
|
||||||
|
publish = [ "deploy.test.>" "deploy.prod.>" ];
|
||||||
|
subscribe = [ "deploy.responses.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Host listeners - subscribe to their tier, publish responses
|
||||||
|
{
|
||||||
|
nkey = "UDEF..."; # host-listener (one per host)
|
||||||
|
permissions = {
|
||||||
|
subscribe = [ "deploy.*.>" ];
|
||||||
|
publish = [ "deploy.responses.>" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Host tier assignments** (via `homelab.host.tier`):
|
||||||
|
| Tier | Hosts |
|
||||||
|
|------|-------|
|
||||||
|
| test | template1, nix-cache01, future test hosts |
|
||||||
|
| prod | ns1, ns2, ha1, monitoring01, http-proxy, etc. |
|
||||||
|
|
||||||
|
**Example deployment scenarios:**
|
||||||
|
|
||||||
|
| Command | Subject | MCP | Admin |
|
||||||
|
|---------|---------|-----|-------|
|
||||||
|
| Deploy to ns1 | `deploy.prod.ns1` | ❌ | ✅ |
|
||||||
|
| Deploy to template1 | `deploy.test.template1` | ✅ | ✅ |
|
||||||
|
| Deploy to all test hosts | `deploy.test.all` | ✅ | ✅ |
|
||||||
|
| Deploy to all prod hosts | `deploy.prod.all` | ❌ | ✅ |
|
||||||
|
| Deploy to all DNS servers | `deploy.prod.role.dns` | ❌ | ✅ |
|
||||||
|
|
||||||
|
All NKeys stored in Vault - MCP gets limited credentials, admin CLI gets full-access credentials.
|
||||||
|
|
||||||
|
### Host Metadata
|
||||||
|
|
||||||
|
Rather than defining `tier` in the listener config, use a central `homelab.host` module that provides host metadata for multiple consumers. This aligns with the approach proposed in `docs/plans/prometheus-scrape-target-labels.md`.
|
||||||
|
|
||||||
|
**Status:** The `homelab.host` module is implemented in `modules/homelab/host.nix`.
|
||||||
|
Hosts can be filtered by tier using `config.homelab.host.tier`.
|
||||||
|
|
||||||
|
**Module definition (in `modules/homelab/host.nix`):**
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "test" "prod" ];
|
||||||
|
default = "prod";
|
||||||
|
description = "Deployment tier - controls which credentials can deploy to this host";
|
||||||
|
};
|
||||||
|
|
||||||
|
priority = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "high" "low" ];
|
||||||
|
default = "high";
|
||||||
|
description = "Alerting priority - low priority hosts have relaxed thresholds";
|
||||||
|
};
|
||||||
|
|
||||||
|
role = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.str;
|
||||||
|
default = null;
|
||||||
|
description = "Primary role of this host (dns, database, monitoring, etc.)";
|
||||||
|
};
|
||||||
|
|
||||||
|
labels = lib.mkOption {
|
||||||
|
type = lib.types.attrsOf lib.types.str;
|
||||||
|
default = { };
|
||||||
|
description = "Additional free-form labels";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Consumers:**
|
||||||
|
- `homelab-deploy` listener reads `config.homelab.host.tier` for subject subscription
|
||||||
|
- Prometheus scrape config reads `priority`, `role`, `labels` for target labels
|
||||||
|
- Future services can consume the same metadata
|
||||||
|
|
||||||
|
**Example host config:**
|
||||||
|
```nix
|
||||||
|
# hosts/nix-cache01/configuration.nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = "test"; # can be deployed by MCP
|
||||||
|
priority = "low"; # relaxed alerting thresholds
|
||||||
|
role = "build-host";
|
||||||
|
};
|
||||||
|
|
||||||
|
# hosts/ns1/configuration.nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod"; # requires admin credentials
|
||||||
|
priority = "high";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
### Phase 1: Core Binary + Listener
|
||||||
|
|
||||||
|
1. **Create homelab-deploy repository**
|
||||||
|
- Initialize Go module
|
||||||
|
- Set up flake.nix with Go package build
|
||||||
|
|
||||||
|
2. **Implement listener mode**
|
||||||
|
- NATS subscription logic
|
||||||
|
- nixos-rebuild execution
|
||||||
|
- Status reporting via NATS reply
|
||||||
|
|
||||||
|
3. **Create NixOS module**
|
||||||
|
- Systemd service definition
|
||||||
|
- Configuration options (hostname, NATS URL, NKey path)
|
||||||
|
- Vault secret integration for NKeys
|
||||||
|
|
||||||
|
4. **Create `homelab.host` module** (in nixos-servers)
|
||||||
|
- Define `tier`, `priority`, `role`, `labels` options
|
||||||
|
- This module is shared with Prometheus label work (see `docs/plans/prometheus-scrape-target-labels.md`)
|
||||||
|
|
||||||
|
5. **Integrate with nixos-servers**
|
||||||
|
- Add flake input for homelab-deploy
|
||||||
|
- Import listener module in `system/`
|
||||||
|
- Set `homelab.host.tier` per host (test vs prod)
|
||||||
|
|
||||||
|
6. **Configure NATS tiered permissions**
|
||||||
|
- Add deployer users to nats1 config (mcp-deployer, admin-deployer)
|
||||||
|
- Set up subject ACLs per user (test-only vs full access)
|
||||||
|
- Add deployer NKeys to Vault
|
||||||
|
- Create Terraform resources for NKey secrets
|
||||||
|
|
||||||
|
### Phase 2: MCP + CLI
|
||||||
|
|
||||||
|
7. **Implement MCP mode**
|
||||||
|
- MCP server with deploy/status tools
|
||||||
|
- Request/reply pattern for deployment feedback
|
||||||
|
|
||||||
|
8. **Implement CLI commands**
|
||||||
|
- `deploy` command for manual deployments
|
||||||
|
- `status` command to check deployment state
|
||||||
|
|
||||||
|
9. **Configure Claude Code**
|
||||||
|
- Add MCP server to configuration
|
||||||
|
- Document usage
|
||||||
|
|
||||||
|
### Phase 3: Enhancements
|
||||||
|
|
||||||
|
10. Add deployment locking (prevent concurrent deploys)
|
||||||
|
11. Prometheus metrics for deployment status
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
- **Privilege escalation**: Listener runs as root to execute nixos-rebuild
|
||||||
|
- **Input validation**: Strictly validate revision format (branch name or commit hash)
|
||||||
|
- **Rate limiting**: Prevent rapid-fire deployments
|
||||||
|
- **Audit logging**: Log all deployment requests with source identity
|
||||||
|
- **Network isolation**: NATS only accessible from internal network
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
All open questions have been resolved. See Notes section for decision rationale.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The existing `nixos-rebuild-test` helper provides a good reference for the rebuild logic
|
||||||
|
- Uses NATS request/reply pattern for immediate validation feedback and completion status
|
||||||
|
- Consider using NATS headers for metadata (request ID, timestamp)
|
||||||
|
- **Timeout decision**: Metrics show no-change upgrades complete in 5-55 seconds. A 10-minute default provides ample headroom for actual updates with package downloads. Per-host override available for hosts with known longer build times.
|
||||||
|
- **Rollback**: Not needed as a separate feature - deploy an older commit hash to effectively rollback.
|
||||||
|
- **Offline hosts**: No message persistence - if a host is offline, the deploy simply fails, and the daily auto-upgrade serves as the safety net. This avoids the complexity of JetStream deduplication (e.g., a host coming back online and applying 10 queued updates in sequence instead of just the latest one).
|
||||||
|
- **Deploy history**: Use existing Loki - listener logs deployments to journald, queryable via Loki. No need for separate JetStream persistence.
|
||||||
|
- **Naming**: `homelab-deploy` - ties it to the infrastructure rather than implementation details.
|
||||||
156
docs/plans/completed/nix-cache-reprovision.md
Normal file
156
docs/plans/completed/nix-cache-reprovision.md
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
# Nix Cache Host Reprovision
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Reprovision `nix-cache01` using the OpenTofu workflow, and improve the build/cache system with:
|
||||||
|
1. NATS-based remote build triggering (replacing the current bash script)
|
||||||
|
2. Safer flake update workflow that validates builds before pushing to master
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
**Phase 1: New Build Host** - COMPLETE
|
||||||
|
**Phase 2: NATS Build Triggering** - COMPLETE
|
||||||
|
**Phase 3: Safe Flake Update Workflow** - NOT STARTED
|
||||||
|
**Phase 4: Complete Migration** - COMPLETE
|
||||||
|
**Phase 5: Scheduled Builds** - COMPLETE
|
||||||
|
|
||||||
|
## Completed Work
|
||||||
|
|
||||||
|
### New Build Host (nix-cache02)
|
||||||
|
|
||||||
|
Instead of reprovisioning nix-cache01 in-place, we created a new host `nix-cache02` at 10.69.13.25:
|
||||||
|
|
||||||
|
- **Specs**: 8 CPU cores, 16GB RAM (temporary — will increase to 24GB after nix-cache01 is decommissioned), 200GB disk
|
||||||
|
- **Provisioned via OpenTofu** with automatic Vault credential bootstrapping
|
||||||
|
- **Builder service** configured with two repos:
|
||||||
|
- `nixos-servers` → `git+https://git.t-juice.club/torjus/nixos-servers.git`
|
||||||
|
- `nixos` (gunter) → `git+https://git.t-juice.club/torjus/nixos.git`
|
||||||
|
|
||||||
|
### NATS-Based Build Triggering
|
||||||
|
|
||||||
|
The `homelab-deploy` tool was extended with a builder mode:
|
||||||
|
|
||||||
|
**NATS Subjects:**
|
||||||
|
- `build.<repo>.<target>` - e.g., `build.nixos-servers.all` or `build.nixos-servers.ns1`
|
||||||
|
|
||||||
|
**NATS Permissions (in DEPLOY account):**
|
||||||
|
| User | Publish | Subscribe |
|
||||||
|
|------|---------|-----------|
|
||||||
|
| Builder | `build.responses.>` | `build.>` |
|
||||||
|
| Test deployer | `deploy.test.>`, `deploy.discover`, `build.>` | `deploy.responses.>`, `deploy.discover`, `build.responses.>` |
|
||||||
|
| Admin deployer | `deploy.>`, `build.>` | `deploy.>`, `build.responses.>` |
|
||||||
|
|
||||||
|
**Vault Secrets:**
|
||||||
|
- `shared/homelab-deploy/builder-nkey` - NKey seed for builder authentication
|
||||||
|
|
||||||
|
**NixOS Configuration:**
|
||||||
|
- `hosts/nix-cache02/builder.nix` - Builder service configuration
|
||||||
|
- `services/nats/default.nix` - Updated with builder NATS user
|
||||||
|
|
||||||
|
**MCP Integration:**
|
||||||
|
- `.mcp.json` updated with `--enable-builds` flag
|
||||||
|
- Build tool available via MCP for Claude Code
|
||||||
|
|
||||||
|
**Tested:**
|
||||||
|
- Single host build: `build nixos-servers testvm01` (~30s)
|
||||||
|
- All hosts build: `build nixos-servers all` (16 hosts in ~226s)
|
||||||
|
|
||||||
|
### Harmonia Binary Cache
|
||||||
|
|
||||||
|
- Parameterized `services/nix-cache/harmonia.nix` to use hostname-based Vault paths
|
||||||
|
- Parameterized `services/nix-cache/proxy.nix` for hostname-based domain
|
||||||
|
- New signing key: `nix-cache02.home.2rjus.net-1`
|
||||||
|
- Vault secret: `hosts/nix-cache02/cache-secret`
|
||||||
|
- Removed unused Gitea Actions runner from nix-cache01
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### nix-cache02 (Active)
|
||||||
|
- Running at 10.69.13.25
|
||||||
|
- Serving `https://nix-cache.home.2rjus.net` (canonical URL)
|
||||||
|
- Builder service active, responding to NATS build requests
|
||||||
|
- Metrics exposed on port 9973 (`homelab-deploy-builder` job)
|
||||||
|
- Harmonia binary cache server running
|
||||||
|
- Signing key: `nix-cache02.home.2rjus.net-1`
|
||||||
|
- Prod tier with `build-host` role
|
||||||
|
|
||||||
|
### nix-cache01 (Decommissioned)
|
||||||
|
- VM deleted from Proxmox
|
||||||
|
- Host configuration removed from repo
|
||||||
|
- Vault AppRole and secrets removed
|
||||||
|
- Old signing key removed from trusted-public-keys
|
||||||
|
|
||||||
|
## Remaining Work
|
||||||
|
|
||||||
|
### Phase 3: Safe Flake Update Workflow
|
||||||
|
|
||||||
|
1. Create `.github/workflows/flake-update-safe.yaml`
|
||||||
|
2. Disable or remove old `flake-update.yaml`
|
||||||
|
3. Test manually with `workflow_dispatch`
|
||||||
|
4. Monitor first automated run
|
||||||
|
|
||||||
|
### Phase 4: Complete Migration ✅
|
||||||
|
|
||||||
|
1. ~~**Add Harmonia to nix-cache02**~~ ✅ Done - new signing key, parameterized service
|
||||||
|
2. ~~**Add trusted public key to all hosts**~~ ✅ Done - `system/nix.nix` updated
|
||||||
|
3. ~~**Test cache from other hosts**~~ ✅ Done - verified from testvm01
|
||||||
|
4. ~~**Update proxy and DNS**~~ ✅ Done - `nix-cache.home.2rjus.net` CNAME now points to nix-cache02
|
||||||
|
5. ~~**Deploy to all hosts**~~ ✅ Done - all hosts have new trusted key
|
||||||
|
6. ~~**Decommission nix-cache01**~~ ✅ Done - 2026-02-10:
|
||||||
|
- Removed `hosts/nix-cache01/` directory
|
||||||
|
- Removed `services/nix-cache/build-flakes.{nix,sh}`
|
||||||
|
- Removed Vault AppRole and secrets
|
||||||
|
- Removed old signing key from `system/nix.nix`
|
||||||
|
- Removed from `flake.nix`
|
||||||
|
- Deleted VM from Proxmox
|
||||||
|
|
||||||
|
### Phase 5: Scheduled Builds ✅
|
||||||
|
|
||||||
|
Implemented a systemd timer on nix-cache02 that triggers builds every 2 hours:
|
||||||
|
|
||||||
|
- **Timer**: `scheduled-build.timer` runs every 2 hours with 5m random jitter
|
||||||
|
- **Service**: `scheduled-build.service` calls `homelab-deploy build` for both repos
|
||||||
|
- **Authentication**: Dedicated scheduler NKey stored in Vault
|
||||||
|
- **NATS user**: Added to DEPLOY account with publish `build.>` and subscribe `build.responses.>`
|
||||||
|
|
||||||
|
Files:
|
||||||
|
- `hosts/nix-cache02/scheduler.nix` - Timer and service configuration
|
||||||
|
- `services/nats/default.nix` - Scheduler NATS user
|
||||||
|
- `terraform/vault/secrets.tf` - Scheduler NKey secret
|
||||||
|
- `terraform/vault/variables.tf` - Variable for scheduler NKey
|
||||||
|
|
||||||
|
## Resolved Questions
|
||||||
|
|
||||||
|
- **Parallel vs sequential builds?** Sequential - hosts share packages, so subsequent builds are fast after the first
|
||||||
|
- **What about gunter?** Configured as `nixos` repo in builder settings
|
||||||
|
- **Disk size?** 200GB for new host
|
||||||
|
- **Build host specs?** 8 cores and 16-24GB RAM, matching the current nix-cache01
|
||||||
|
|
||||||
|
### Phase 6: Observability
|
||||||
|
|
||||||
|
1. **Alerting rules** for build failures:
|
||||||
|
```promql
|
||||||
|
# Alert if any build fails
|
||||||
|
increase(homelab_deploy_build_host_total{status="failure"}[1h]) > 0
|
||||||
|
|
||||||
|
# Alert if no successful builds in 24h (scheduled builds stopped)
|
||||||
|
time() - homelab_deploy_build_last_success_timestamp > 86400
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Grafana dashboard** for build metrics:
|
||||||
|
- Build success/failure rate over time
|
||||||
|
- Average build duration per host (histogram)
|
||||||
|
- Build frequency (builds per hour/day)
|
||||||
|
- Last successful build timestamp per repo
|
||||||
|
|
||||||
|
Available metrics:
|
||||||
|
- `homelab_deploy_builds_total{repo, status}` - total builds by repo and status
|
||||||
|
- `homelab_deploy_build_host_total{repo, host, status}` - per-host build counts
|
||||||
|
- `homelab_deploy_build_duration_seconds_{bucket,sum,count}` - build duration histogram
|
||||||
|
- `homelab_deploy_build_last_timestamp{repo}` - last build attempt
|
||||||
|
- `homelab_deploy_build_last_success_timestamp{repo}` - last successful build
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [x] ~~When to cut over DNS from nix-cache01 to nix-cache02?~~ Done - 2026-02-10
|
||||||
|
- [ ] Implement safe flake update workflow before or after full migration?
|
||||||
176
docs/plans/completed/nixos-exporter.md
Normal file
176
docs/plans/completed/nixos-exporter.md
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
# NixOS Prometheus Exporter
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Build a generic Prometheus exporter for NixOS-specific metrics. This exporter should be useful for any NixOS deployment, not just our homelab.
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Provide visibility into NixOS system state that standard exporters don't cover:
|
||||||
|
- Generation management (count, age, current vs booted)
|
||||||
|
- Flake input freshness
|
||||||
|
- Upgrade status
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
### Core Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `nixos_generation_count` | Number of system generations | Count entries in `/nix/var/nix/profiles/system-*` |
|
||||||
|
| `nixos_current_generation` | Active generation number | Parse `readlink /run/current-system` |
|
||||||
|
| `nixos_booted_generation` | Generation that was booted | Parse `/run/booted-system` |
|
||||||
|
| `nixos_generation_age_seconds` | Age of current generation | File mtime of current system profile |
|
||||||
|
| `nixos_config_mismatch` | 1 if booted != current, 0 otherwise | Compare symlink targets |
|
||||||
|
|
||||||
|
### Flake Metrics (optional collector)
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `nixos_flake_input_age_seconds` | Age of each flake.lock input | Parse `lastModified` from flake.lock |
|
||||||
|
| `nixos_flake_input_info` | Info gauge with rev label | Parse `rev` from flake.lock |
|
||||||
|
|
||||||
|
Labels: `input` (e.g., "nixpkgs", "home-manager")
|
||||||
|
|
||||||
|
### Future Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `nixos_upgrade_pending` | 1 if remote differs from local | Compare flake refs (expensive) |
|
||||||
|
| `nixos_store_size_bytes` | Size of /nix/store | `du` or filesystem stats |
|
||||||
|
| `nixos_store_path_count` | Number of store paths | Count entries |
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
Single binary with optional collectors enabled via config or flags.
|
||||||
|
|
||||||
|
```
|
||||||
|
nixos-exporter
|
||||||
|
├── main.go
|
||||||
|
├── collector/
|
||||||
|
│ ├── generation.go # Core generation metrics
|
||||||
|
│ └── flake.go # Flake input metrics
|
||||||
|
└── config/
|
||||||
|
└── config.go
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
listen_addr: ":9971"
|
||||||
|
collectors:
|
||||||
|
generation:
|
||||||
|
enabled: true
|
||||||
|
flake:
|
||||||
|
enabled: false
|
||||||
|
lock_path: "/etc/nixos/flake.lock" # or auto-detect from /run/current-system
|
||||||
|
```
|
||||||
|
|
||||||
|
Command-line alternative:
|
||||||
|
```bash
|
||||||
|
nixos-exporter --listen=:9971 --collector.flake --flake.lock-path=/etc/nixos/flake.lock
|
||||||
|
```
|
||||||
|
|
||||||
|
## NixOS Module
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.prometheus.exporters.nixos = {
|
||||||
|
enable = true;
|
||||||
|
port = 9971;
|
||||||
|
collectors = [ "generation" "flake" ];
|
||||||
|
flake.lockPath = "/etc/nixos/flake.lock";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
The module should integrate with nixpkgs' existing `services.prometheus.exporters.*` pattern.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Language
|
||||||
|
|
||||||
|
Go - mature prometheus client library, single static binary, easy cross-compilation.
|
||||||
|
|
||||||
|
### Phase 1: Core
|
||||||
|
1. Create git repository
|
||||||
|
2. Implement generation collector (count, current, booted, age, mismatch)
|
||||||
|
3. Basic HTTP server with `/metrics` endpoint
|
||||||
|
4. NixOS module
|
||||||
|
|
||||||
|
### Phase 2: Flake Collector
|
||||||
|
1. Parse flake.lock JSON format
|
||||||
|
2. Extract lastModified timestamps per input
|
||||||
|
3. Add input labels
|
||||||
|
|
||||||
|
### Phase 3: Packaging
|
||||||
|
1. Add to nixpkgs or publish as flake
|
||||||
|
2. Documentation
|
||||||
|
3. Example Grafana dashboard
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
|
||||||
|
```
|
||||||
|
# HELP nixos_generation_count Total number of system generations
|
||||||
|
# TYPE nixos_generation_count gauge
|
||||||
|
nixos_generation_count 47
|
||||||
|
|
||||||
|
# HELP nixos_current_generation Currently active generation number
|
||||||
|
# TYPE nixos_current_generation gauge
|
||||||
|
nixos_current_generation 47
|
||||||
|
|
||||||
|
# HELP nixos_booted_generation Generation that was booted
|
||||||
|
# TYPE nixos_booted_generation gauge
|
||||||
|
nixos_booted_generation 46
|
||||||
|
|
||||||
|
# HELP nixos_generation_age_seconds Age of current generation in seconds
|
||||||
|
# TYPE nixos_generation_age_seconds gauge
|
||||||
|
nixos_generation_age_seconds 3600
|
||||||
|
|
||||||
|
# HELP nixos_config_mismatch 1 if booted generation differs from current
|
||||||
|
# TYPE nixos_config_mismatch gauge
|
||||||
|
nixos_config_mismatch 1
|
||||||
|
|
||||||
|
# HELP nixos_flake_input_age_seconds Age of flake input in seconds
|
||||||
|
# TYPE nixos_flake_input_age_seconds gauge
|
||||||
|
nixos_flake_input_age_seconds{input="nixpkgs"} 259200
|
||||||
|
nixos_flake_input_age_seconds{input="home-manager"} 86400
|
||||||
|
```
|
||||||
|
|
||||||
|
## Alert Examples
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- alert: NixOSConfigStale
|
||||||
|
expr: nixos_generation_age_seconds > 7 * 24 * 3600
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "NixOS config on {{ $labels.instance }} is over 7 days old"
|
||||||
|
|
||||||
|
- alert: NixOSRebootRequired
|
||||||
|
expr: nixos_config_mismatch == 1
|
||||||
|
for: 24h
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: "{{ $labels.instance }} needs reboot to apply config"
|
||||||
|
|
||||||
|
- alert: NixpkgsInputStale
|
||||||
|
expr: nixos_flake_input_age_seconds{input="nixpkgs"} > 30 * 24 * 3600
|
||||||
|
for: 1d
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: "nixpkgs input on {{ $labels.instance }} is over 30 days old"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] How to detect flake.lock path automatically? (check /run/current-system for flake info)
|
||||||
|
- [ ] Should generation collector need root? (probably not, just reading symlinks)
|
||||||
|
- [ ] Include in nixpkgs or distribute as standalone flake?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Port 9971 suggested (9970 reserved for homelab-exporter)
|
||||||
|
- Keep scope focused on NixOS-specific metrics - don't duplicate node-exporter
|
||||||
|
- Consider submitting to prometheus exporter registry once stable
|
||||||
107
docs/plans/completed/ns1-recreation.md
Normal file
107
docs/plans/completed/ns1-recreation.md
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
# ns1 Recreation Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Recreate ns1 using the OpenTofu workflow after the existing VM entered emergency mode due to an incorrect hardware-configuration.nix (hardcoded UUIDs that did not match the actual disk layout).
|
||||||
|
|
||||||
|
## Current ns1 Configuration to Preserve
|
||||||
|
|
||||||
|
- **IP:** 10.69.13.5/24
|
||||||
|
- **Gateway:** 10.69.13.1
|
||||||
|
- **Role:** Primary DNS (authoritative + resolver)
|
||||||
|
- **Services:**
|
||||||
|
- `../../services/ns/master-authorative.nix`
|
||||||
|
- `../../services/ns/resolver.nix`
|
||||||
|
- **Metadata:**
|
||||||
|
- `homelab.host.role = "dns"`
|
||||||
|
- `homelab.host.labels.dns_role = "primary"`
|
||||||
|
- **Vault:** enabled
|
||||||
|
- **Deploy:** enabled
|
||||||
|
|
||||||
|
## Execution Steps
|
||||||
|
|
||||||
|
### Phase 1: Remove Old Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop -c create-host --remove --hostname ns1 --force
|
||||||
|
```
|
||||||
|
|
||||||
|
This removes:
|
||||||
|
- `hosts/ns1/` directory
|
||||||
|
- Entry from `flake.nix`
|
||||||
|
- Any terraform entries (none exist currently)
|
||||||
|
|
||||||
|
### Phase 2: Create New Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop -c create-host --hostname ns1 --ip 10.69.13.5/24
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates:
|
||||||
|
- `hosts/ns1/` with template2-based configuration
|
||||||
|
- Entry in `flake.nix`
|
||||||
|
- Entry in `terraform/vms.tf`
|
||||||
|
- Vault wrapped token for bootstrap
|
||||||
|
|
||||||
|
### Phase 3: Customize Configuration
|
||||||
|
|
||||||
|
After create-host, manually update `hosts/ns1/configuration.nix` to add:
|
||||||
|
|
||||||
|
1. DNS service imports:
|
||||||
|
```nix
|
||||||
|
../../services/ns/master-authorative.nix
|
||||||
|
../../services/ns/resolver.nix
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Host metadata:
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Disable resolved (conflicts with Unbound):
|
||||||
|
```nix
|
||||||
|
services.resolved.enable = false;
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 4: Commit Changes
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add -A
|
||||||
|
git commit -m "ns1: recreate with OpenTofu workflow
|
||||||
|
|
||||||
|
Old VM had incorrect hardware-configuration.nix with hardcoded UUIDs
|
||||||
|
that didn't match actual disk layout, causing boot failure.
|
||||||
|
|
||||||
|
Recreated using template2-based configuration for OpenTofu provisioning."
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 5: Infrastructure
|
||||||
|
|
||||||
|
1. Delete old ns1 VM in Proxmox (it's broken anyway)
|
||||||
|
2. Run `nix develop -c tofu -chdir=terraform apply`
|
||||||
|
3. Wait for bootstrap to complete
|
||||||
|
4. Verify ns1 is functional:
|
||||||
|
- DNS resolution working
|
||||||
|
- Zone transfer to ns2 working
|
||||||
|
- All exporters responding
|
||||||
|
|
||||||
|
### Phase 6: Finalize
|
||||||
|
|
||||||
|
- Push to master
|
||||||
|
- Move this plan to `docs/plans/completed/`
|
||||||
|
|
||||||
|
## Rollback
|
||||||
|
|
||||||
|
If the new VM fails:
|
||||||
|
1. ns2 is still operational as secondary DNS
|
||||||
|
2. Can recreate with different settings if needed
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- ns2 will continue serving DNS during the migration
|
||||||
|
- Zone data is generated from flake, so no data loss
|
||||||
|
- The old VM's disk can be kept briefly in Proxmox as backup if desired
|
||||||
87
docs/plans/completed/openbao-kanidm-oidc.md
Normal file
87
docs/plans/completed/openbao-kanidm-oidc.md
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# OpenBao + Kanidm OIDC Integration
|
||||||
|
|
||||||
|
## Status: Completed
|
||||||
|
|
||||||
|
Implemented 2026-02-09.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Enable Kanidm users to authenticate to OpenBao (Vault) using OIDC for Web UI access. Members of the `admins` group get full read/write access to secrets.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Files Modified
|
||||||
|
|
||||||
|
| File | Changes |
|
||||||
|
|------|---------|
|
||||||
|
| `terraform/vault/oidc.tf` | New - OIDC auth backend and roles |
|
||||||
|
| `terraform/vault/policies.tf` | Added oidc-admin and oidc-default policies |
|
||||||
|
| `terraform/vault/secrets.tf` | Added OAuth2 client secret |
|
||||||
|
| `terraform/vault/approle.tf` | Granted kanidm01 access to openbao secrets |
|
||||||
|
| `services/kanidm/default.nix` | Added openbao OAuth2 client, enabled imperative group membership |
|
||||||
|
|
||||||
|
### Kanidm Configuration
|
||||||
|
|
||||||
|
OAuth2 client `openbao` with:
|
||||||
|
- Confidential client (uses client secret)
|
||||||
|
- Web UI callback only: `https://vault.home.2rjus.net:8200/ui/vault/auth/oidc/oidc/callback`
|
||||||
|
- Legacy crypto enabled (RS256 for OpenBao compatibility)
|
||||||
|
- Scope maps for `admins` and `users` groups
|
||||||
|
|
||||||
|
Group membership is now managed imperatively (`overwriteMembers = false`) to prevent provisioning from resetting group memberships on service restart.
|
||||||
|
|
||||||
|
### OpenBao Configuration
|
||||||
|
|
||||||
|
OIDC auth backend at `/oidc` with two roles:
|
||||||
|
|
||||||
|
| Role | Bound Claims | Policy | Access |
|
||||||
|
|------|--------------|--------|--------|
|
||||||
|
| `admin` | `groups = admins@home.2rjus.net` | `oidc-admin` | Full read/write to secrets, system health/metrics |
|
||||||
|
| `default` | (none) | `oidc-default` | Token lookup-self, system health |
|
||||||
|
|
||||||
|
Both roles request scopes: `openid`, `profile`, `email`, `groups`
|
||||||
|
|
||||||
|
### Policies
|
||||||
|
|
||||||
|
**oidc-admin:**
|
||||||
|
- `secret/*` - create, read, update, delete, list
|
||||||
|
- `sys/health` - read
|
||||||
|
- `sys/metrics` - read
|
||||||
|
- `sys/auth` - read
|
||||||
|
- `sys/mounts` - read
|
||||||
|
|
||||||
|
**oidc-default:**
|
||||||
|
- `auth/token/lookup-self` - read
|
||||||
|
- `sys/health` - read
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Web UI Login
|
||||||
|
1. Navigate to https://vault.home.2rjus.net:8200
|
||||||
|
2. Select "OIDC" authentication method
|
||||||
|
3. Enter role: `admin` (for admins) or `default` (for any user)
|
||||||
|
4. Click "Sign in with OIDC"
|
||||||
|
5. Authenticate with Kanidm
|
||||||
|
|
||||||
|
### Group Management
|
||||||
|
Add users to admins group for full access:
|
||||||
|
```bash
|
||||||
|
kanidm group add-members admins <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Limitations
|
||||||
|
|
||||||
|
**CLI login not supported:** Kanidm requires HTTPS for all redirect URIs on confidential (non-public) OAuth2 clients. OpenBao CLI uses `http://localhost:8250/oidc/callback` which Kanidm rejects. Public clients would allow localhost redirects, but OpenBao requires a client secret for OIDC auth.
|
||||||
|
|
||||||
|
## Lessons Learned
|
||||||
|
|
||||||
|
1. **Kanidm group names:** Groups are returned as `groupname@domain` (e.g., `admins@home.2rjus.net`), not just the short name
|
||||||
|
2. **RS256 required:** OpenBao only supports RS256 for JWT signing; Kanidm defaults to ES256, requiring `enableLegacyCrypto = true`
|
||||||
|
3. **Scope request:** OIDC roles must explicitly request the `groups` scope via `oidc_scopes`
|
||||||
|
4. **Provisioning resets:** Kanidm provisioning with default `overwriteMembers = true` resets group memberships on restart
|
||||||
|
5. **Two-phase Terraform:** Secret must exist before OIDC backend can validate discovery URL
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [OpenBao JWT/OIDC Auth Method](https://openbao.org/docs/auth/jwt/)
|
||||||
|
- [Kanidm OAuth2 Documentation](https://kanidm.github.io/kanidm/stable/integrations/oauth2.html)
|
||||||
113
docs/plans/completed/pgdb1-decommission.md
Normal file
113
docs/plans/completed/pgdb1-decommission.md
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
# pgdb1 Decommissioning Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Decommission the pgdb1 PostgreSQL server. The only consumer was Open WebUI on gunter, which has been migrated to use a local PostgreSQL instance.
|
||||||
|
|
||||||
|
## Pre-flight Verification
|
||||||
|
|
||||||
|
Before proceeding, verify that gunter is no longer using pgdb1:
|
||||||
|
|
||||||
|
1. Check Open WebUI on gunter is configured for local PostgreSQL (not 10.69.13.16)
|
||||||
|
2. Optionally: Check pgdb1 for recent connection activity:
|
||||||
|
```bash
|
||||||
|
ssh pgdb1 'sudo -u postgres psql -c "SELECT * FROM pg_stat_activity WHERE datname IS NOT NULL;"'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Files to Remove
|
||||||
|
|
||||||
|
### Host Configuration
|
||||||
|
- `hosts/pgdb1/default.nix`
|
||||||
|
- `hosts/pgdb1/configuration.nix`
|
||||||
|
- `hosts/pgdb1/hardware-configuration.nix`
|
||||||
|
- `hosts/pgdb1/` (directory)
|
||||||
|
|
||||||
|
### Service Module
|
||||||
|
- `services/postgres/postgres.nix`
|
||||||
|
- `services/postgres/default.nix`
|
||||||
|
- `services/postgres/` (directory)
|
||||||
|
|
||||||
|
Note: This service module is only used by pgdb1, so it can be removed entirely.
|
||||||
|
|
||||||
|
### Flake Entry
|
||||||
|
Remove from `flake.nix` (lines 131-138):
|
||||||
|
```nix
|
||||||
|
pgdb1 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/pgdb1
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Vault AppRole
|
||||||
|
Remove from `terraform/vault/approle.tf` (lines 69-73):
|
||||||
|
```hcl
|
||||||
|
"pgdb1" = {
|
||||||
|
paths = [
|
||||||
|
"secret/data/hosts/pgdb1/*",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Monitoring Rules
|
||||||
|
Remove from `services/monitoring/rules.yml` the `postgres_down` alert (lines 359-365):
|
||||||
|
```yaml
|
||||||
|
- name: postgres_rules
|
||||||
|
rules:
|
||||||
|
- alert: postgres_down
|
||||||
|
expr: node_systemd_unit_state{instance="pgdb1.home.2rjus.net:9100", name="postgresql.service", state="active"} == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
```
|
||||||
|
|
||||||
|
### Utility Scripts
|
||||||
|
Delete `rebuild-all.sh` entirely (obsolete script).
|
||||||
|
|
||||||
|
## Execution Steps
|
||||||
|
|
||||||
|
### Phase 1: Verification
|
||||||
|
- [ ] Confirm Open WebUI on gunter uses local PostgreSQL
|
||||||
|
- [ ] Verify no active connections to pgdb1
|
||||||
|
|
||||||
|
### Phase 2: Code Cleanup
|
||||||
|
- [ ] Create feature branch: `git checkout -b decommission-pgdb1`
|
||||||
|
- [ ] Remove `hosts/pgdb1/` directory
|
||||||
|
- [ ] Remove `services/postgres/` directory
|
||||||
|
- [ ] Remove pgdb1 entry from `flake.nix`
|
||||||
|
- [ ] Remove postgres alert from `services/monitoring/rules.yml`
|
||||||
|
- [ ] Delete `rebuild-all.sh` (obsolete)
|
||||||
|
- [ ] Run `nix flake check` to verify no broken references
|
||||||
|
- [ ] Commit changes
|
||||||
|
|
||||||
|
### Phase 3: Terraform Cleanup
|
||||||
|
- [ ] Remove pgdb1 from `terraform/vault/approle.tf`
|
||||||
|
- [ ] Run `tofu plan` in `terraform/vault/` to preview changes
|
||||||
|
- [ ] Run `tofu apply` to remove the AppRole
|
||||||
|
- [ ] Commit terraform changes
|
||||||
|
|
||||||
|
### Phase 4: Infrastructure Cleanup
|
||||||
|
- [ ] Shut down pgdb1 VM in Proxmox
|
||||||
|
- [ ] Delete the VM from Proxmox
|
||||||
|
- [ ] (Optional) Remove any DNS entries if not auto-generated
|
||||||
|
|
||||||
|
### Phase 5: Finalize
|
||||||
|
- [ ] Merge feature branch to master
|
||||||
|
- [ ] Trigger auto-upgrade on DNS servers (ns1, ns2) to remove DNS entry
|
||||||
|
- [ ] Move this plan to `docs/plans/completed/`
|
||||||
|
|
||||||
|
## Rollback
|
||||||
|
|
||||||
|
If issues arise after decommissioning:
|
||||||
|
1. The VM can be recreated from template using the git history
|
||||||
|
2. Database data would need to be restored from backup (if any exists)
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- pgdb1 IP: 10.69.13.16
|
||||||
|
- The postgres service allowed connections from gunter (10.69.30.105)
|
||||||
|
- No restic backup was configured for this host
|
||||||
205
docs/plans/completed/prometheus-scrape-target-labels.md
Normal file
205
docs/plans/completed/prometheus-scrape-target-labels.md
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
# Prometheus Scrape Target Labels
|
||||||
|
|
||||||
|
## Implementation Status
|
||||||
|
|
||||||
|
| Step | Status | Notes |
|
||||||
|
|------|--------|-------|
|
||||||
|
| 1. Create `homelab.host` module | ✅ Complete | `modules/homelab/host.nix` |
|
||||||
|
| 2. Update `lib/monitoring.nix` | ✅ Complete | Labels extracted and propagated |
|
||||||
|
| 3. Update Prometheus config | ✅ Complete | Uses structured static_configs |
|
||||||
|
| 4. Set metadata on hosts | ✅ Complete | All relevant hosts configured |
|
||||||
|
| 5. Update alert rules | ✅ Complete | Role-based filtering implemented |
|
||||||
|
| 6. Labels for service targets | ✅ Complete | Host labels propagated to all services |
|
||||||
|
| 7. Add hostname label | ✅ Complete | All targets have `hostname` label for easy filtering |
|
||||||
|
|
||||||
|
**Hosts with metadata configured:**
|
||||||
|
- `ns1`, `ns2`: `role = "dns"`, `labels.dns_role = "primary"/"secondary"`
|
||||||
|
- `nix-cache01`: `role = "build-host"`
|
||||||
|
- `vault01`: `role = "vault"`
|
||||||
|
- `testvm01/02/03`: `tier = "test"`
|
||||||
|
|
||||||
|
**Implementation complete.** Branch: `prometheus-scrape-target-labels`
|
||||||
|
|
||||||
|
**Query examples:**
|
||||||
|
- `{hostname="ns1"}` - all metrics from ns1 (any job/port)
|
||||||
|
- `node_cpu_seconds_total{hostname="monitoring01"}` - specific metric by hostname
|
||||||
|
- `up{role="dns"}` - all DNS servers
|
||||||
|
- `up{tier="test"}` - all test-tier hosts
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Add support for custom per-host labels on Prometheus scrape targets, enabling alert rules to reference host metadata (priority, role) instead of hardcoding instance names.
|
||||||
|
|
||||||
|
**Related:** This plan shares the `homelab.host` module with `docs/plans/completed/nats-deploy-service.md`, which uses the same metadata for deployment tier assignment.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Some hosts have workloads that make generic alert thresholds inappropriate. For example, `nix-cache01` regularly hits high CPU during builds, requiring a longer `for` duration on `high_cpu_load`. Currently this is handled by excluding specific instance names in PromQL expressions, which is brittle and doesn't scale.
|
||||||
|
|
||||||
|
With per-host labels, alert rules can use semantic filters like `{priority!="low"}` instead of `{instance!="nix-cache01.home.2rjus.net:9100"}`.
|
||||||
|
|
||||||
|
## Proposed Labels
|
||||||
|
|
||||||
|
### `priority`
|
||||||
|
|
||||||
|
Indicates alerting importance. Hosts with `priority = "low"` can have relaxed thresholds or longer durations in alert rules.
|
||||||
|
|
||||||
|
Values: `"high"` (default), `"low"`
|
||||||
|
|
||||||
|
### `role`
|
||||||
|
|
||||||
|
Describes the function of the host. Useful for grouping in dashboards and targeting role-specific alert rules.
|
||||||
|
|
||||||
|
Values: free-form string, e.g. `"dns"`, `"build-host"`, `"database"`, `"monitoring"`
|
||||||
|
|
||||||
|
**Note on multiple roles:** Prometheus labels are strictly string values, not lists. For hosts that serve multiple roles there are a few options:
|
||||||
|
|
||||||
|
- **Separate boolean labels:** `role_build_host = "true"`, `role_cache_server = "true"` -- flexible but verbose, and requires updating the module when new roles are added.
|
||||||
|
- **Delimited string:** `role = "build-host,cache-server"` -- works with regex matchers (`{role=~".*build-host.*"}`), but regex matching is less clean and more error-prone.
|
||||||
|
- **Pick a primary role:** `role = "build-host"` -- simplest, and probably sufficient since most hosts have one primary role.
|
||||||
|
|
||||||
|
Recommendation: start with a single primary role string. If multi-role matching becomes a real need, switch to separate boolean labels.
|
||||||
|
|
||||||
|
### `dns_role`
|
||||||
|
|
||||||
|
For DNS servers specifically, distinguish between primary and secondary resolvers. The secondary resolver (ns2) receives very little traffic and has a cold cache, making generic cache hit ratio alerts inappropriate.
|
||||||
|
|
||||||
|
Values: `"primary"`, `"secondary"`
|
||||||
|
|
||||||
|
Example use case: The `unbound_low_cache_hit_ratio` alert fires on ns2 because its cache hit ratio (~62%) is lower than ns1 (~90%). This is expected behavior since ns2 gets ~100x less traffic. With a `dns_role` label, the alert can either exclude secondaries or use different thresholds:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Only alert on primary DNS
|
||||||
|
unbound_cache_hit_ratio < 0.7 and on(instance) unbound_up{dns_role="primary"}
|
||||||
|
|
||||||
|
# Or use different thresholds
|
||||||
|
(unbound_cache_hit_ratio < 0.7 and on(instance) unbound_up{dns_role="primary"})
|
||||||
|
or
|
||||||
|
(unbound_cache_hit_ratio < 0.5 and on(instance) unbound_up{dns_role="secondary"})
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
This implementation uses a shared `homelab.host` module that provides host metadata for multiple consumers (Prometheus labels, deployment tiers, etc.). See also `docs/plans/completed/nats-deploy-service.md` which uses the same module for deployment tier assignment.
|
||||||
|
|
||||||
|
### 1. Create `homelab.host` module
|
||||||
|
|
||||||
|
✅ **Complete.** The module is in `modules/homelab/host.nix`.
|
||||||
|
|
||||||
|
Create `modules/homelab/host.nix` with shared host metadata options:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{ lib, ... }:
|
||||||
|
{
|
||||||
|
options.homelab.host = {
|
||||||
|
tier = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "test" "prod" ];
|
||||||
|
default = "prod";
|
||||||
|
description = "Deployment tier - controls which credentials can deploy to this host";
|
||||||
|
};
|
||||||
|
|
||||||
|
priority = lib.mkOption {
|
||||||
|
type = lib.types.enum [ "high" "low" ];
|
||||||
|
default = "high";
|
||||||
|
description = "Alerting priority - low priority hosts have relaxed thresholds";
|
||||||
|
};
|
||||||
|
|
||||||
|
role = lib.mkOption {
|
||||||
|
type = lib.types.nullOr lib.types.str;
|
||||||
|
default = null;
|
||||||
|
description = "Primary role of this host (dns, database, monitoring, etc.)";
|
||||||
|
};
|
||||||
|
|
||||||
|
labels = lib.mkOption {
|
||||||
|
type = lib.types.attrsOf lib.types.str;
|
||||||
|
default = { };
|
||||||
|
description = "Additional free-form labels (e.g., dns_role = 'primary')";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Import this module in `modules/homelab/default.nix`.
|
||||||
|
|
||||||
|
### 2. Update `lib/monitoring.nix`
|
||||||
|
|
||||||
|
✅ **Complete.** Labels are now extracted and propagated.
|
||||||
|
|
||||||
|
- `extractHostMonitoring` should also extract `homelab.host` values (priority, role, labels).
|
||||||
|
- Build the combined label set from `homelab.host`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Combine structured options + free-form labels
|
||||||
|
effectiveLabels =
|
||||||
|
(lib.optionalAttrs (host.priority != "high") { priority = host.priority; })
|
||||||
|
// (lib.optionalAttrs (host.role != null) { role = host.role; })
|
||||||
|
// host.labels;
|
||||||
|
```
|
||||||
|
|
||||||
|
- `generateNodeExporterTargets` returns structured `static_configs` entries, grouping targets by their label sets:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Before (flat list):
|
||||||
|
[ "ns1.home.2rjus.net:9100" "ns2.home.2rjus.net:9100" ... ]
|
||||||
|
|
||||||
|
# After (grouped by labels):
|
||||||
|
[
|
||||||
|
  { targets = [ "ns1.home.2rjus.net:9100" "ns2.home.2rjus.net:9100" ... ]; }
|
||||||
|
  { targets = [ "nix-cache01.home.2rjus.net:9100" ]; labels = { priority = "low"; role = "build-host"; }; }
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
This requires grouping hosts by their label attrset and producing one `static_configs` entry per unique label combination. Hosts with default values (priority=high, no role, no labels) get grouped together with no extra labels (preserving current behavior).
|
||||||
|
|
||||||
|
### 3. Update `services/monitoring/prometheus.nix`
|
||||||
|
|
||||||
|
✅ **Complete.** Now uses structured static_configs output.
|
||||||
|
|
||||||
|
Change the node-exporter scrape config to use the new structured output:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Before:
|
||||||
|
static_configs = [{ targets = nodeExporterTargets; }];
|
||||||
|
|
||||||
|
# After:
|
||||||
|
static_configs = nodeExporterTargets;
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Set metadata on hosts
|
||||||
|
|
||||||
|
✅ **Complete.** All relevant hosts have metadata configured. Note: The implementation filters by `role` rather than `priority`, which matches the existing nix-cache01 configuration.
|
||||||
|
|
||||||
|
Example in `hosts/nix-cache01/configuration.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
priority = "low"; # relaxed alerting thresholds
|
||||||
|
role = "build-host";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** Current implementation only sets `role = "build-host"`. Consider adding `priority = "low"` when label propagation is implemented.
|
||||||
|
|
||||||
|
Example in `hosts/ns1/configuration.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.host = {
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** `tier` and `priority` use defaults ("prod" and "high"), which is the intended behavior. The current ns1/ns2 configurations match this pattern.
|
||||||
|
|
||||||
|
### 5. Update alert rules
|
||||||
|
|
||||||
|
✅ **Complete.** Updated `services/monitoring/rules.yml`:
|
||||||
|
|
||||||
|
- `high_cpu_load`: Replaced `instance!="nix-cache01..."` with `role!="build-host"` for standard hosts (15m duration) and `role="build-host"` for build hosts (2h duration).
|
||||||
|
- `unbound_low_cache_hit_ratio`: Added `dns_role="primary"` filter to only alert on the primary DNS resolver (secondary has a cold cache).
|
||||||
|
|
||||||
|
### 6. Labels for `generateScrapeConfigs` (service targets)
|
||||||
|
|
||||||
|
✅ **Complete.** Host labels are now propagated to all auto-generated service scrape targets (unbound, homelab-deploy, nixos-exporter, etc.). This enables semantic filtering on any service metric, such as using `dns_role="primary"` with the unbound job.
|
||||||
86
docs/plans/completed/sops-to-openbao-migration.md
Normal file
86
docs/plans/completed/sops-to-openbao-migration.md
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
# Sops to OpenBao Secrets Migration Plan
|
||||||
|
|
||||||
|
## Status: Complete (except ca, deferred)
|
||||||
|
|
||||||
|
## Remaining sops cleanup
|
||||||
|
|
||||||
|
The `sops-nix` flake input, `system/sops.nix`, `.sops.yaml`, and `secrets/` directory are
|
||||||
|
still present because `ca` still uses sops for its step-ca secrets (5 secrets in
|
||||||
|
`services/ca/default.nix`). The `services/authelia/` and `services/lldap/` modules also
|
||||||
|
reference sops but are only used by auth01 (decommissioned).
|
||||||
|
|
||||||
|
Once `ca` is migrated to OpenBao PKI (Phase 4c in host-migration-to-opentofu.md), remove:
|
||||||
|
- `sops-nix` input from `flake.nix`
|
||||||
|
- `sops-nix.nixosModules.sops` from all host module lists in `flake.nix`
|
||||||
|
- `inherit sops-nix` from all specialArgs in `flake.nix`
|
||||||
|
- `system/sops.nix` and its import in `system/default.nix`
|
||||||
|
- `.sops.yaml`
|
||||||
|
- `secrets/` directory
|
||||||
|
- All `sops.secrets.*` declarations in `services/ca/`, `services/authelia/`, `services/lldap/`
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Migrate all hosts from sops-nix secrets to OpenBao (vault) secrets management. Pilot with ha1, then roll out to remaining hosts in waves.
|
||||||
|
|
||||||
|
## Pre-requisites (completed)
|
||||||
|
|
||||||
|
1. Hardcoded root password hash in `system/root-user.nix` (removes sops dependency for all hosts)
|
||||||
|
2. Added `extractKey` option to `system/vault-secrets.nix` (extracts single key as file)
|
||||||
|
|
||||||
|
## Deployment Order
|
||||||
|
|
||||||
|
### Pilot: ha1
|
||||||
|
- Terraform: shared/backup/password secret, ha1 AppRole policy
|
||||||
|
- Provision AppRole credentials via `playbooks/provision-approle.yml`
|
||||||
|
- NixOS: vault.enable + backup-helper vault secret
|
||||||
|
|
||||||
|
### Wave 1: nats1, jelly01, pgdb1
|
||||||
|
- No service secrets (only root password, already handled)
|
||||||
|
- Just need AppRole policies + credential provisioning
|
||||||
|
|
||||||
|
### Wave 2: monitoring01
|
||||||
|
- 3 secrets: backup password, nats nkey, pve-exporter config
|
||||||
|
- Updates: alerttonotify.nix, pve.nix, configuration.nix
|
||||||
|
|
||||||
|
### Wave 3: ns1, then ns2 (critical - deploy ns1 first, verify, then ns2)
|
||||||
|
- DNS zone transfer key (shared/dns/xfer-key)
|
||||||
|
|
||||||
|
### Wave 4: http-proxy
|
||||||
|
- WireGuard private key
|
||||||
|
|
||||||
|
### Wave 5: nix-cache01
|
||||||
|
- Cache signing key + Gitea Actions token
|
||||||
|
|
||||||
|
### Wave 6: ca (DEFERRED - waiting for PKI migration)
|
||||||
|
|
||||||
|
### Skipped: auth01 (decommissioned)
|
||||||
|
|
||||||
|
## Terraform variables needed
|
||||||
|
|
||||||
|
User must extract from sops and add to `terraform/vault/terraform.tfvars`:
|
||||||
|
|
||||||
|
| Variable | Source |
|
||||||
|
|----------|--------|
|
||||||
|
| `backup_helper_secret` | `sops -d secrets/secrets.yaml` |
|
||||||
|
| `ns_xfer_key` | `sops -d secrets/secrets.yaml` |
|
||||||
|
| `nats_nkey` | `sops -d secrets/secrets.yaml` |
|
||||||
|
| `pve_exporter_config` | `sops -d secrets/monitoring01/pve-exporter.yaml` |
|
||||||
|
| `wireguard_private_key` | `sops -d secrets/http-proxy/wireguard.yaml` |
|
||||||
|
| `cache_signing_key` | `sops -d secrets/nix-cache01/cache-secret` |
|
||||||
|
| `actions_token_1` | `sops -d secrets/nix-cache01/actions_token_1` |
|
||||||
|
|
||||||
|
## Provisioning AppRole credentials
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export BAO_ADDR='https://vault01.home.2rjus.net:8200'
|
||||||
|
export BAO_TOKEN='<root-token>'
|
||||||
|
nix develop -c ansible-playbook playbooks/provision-approle.yml -e hostname=<host>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verification (per host)
|
||||||
|
|
||||||
|
1. `systemctl status vault-secret-*` - all secret fetch services succeeded
|
||||||
|
2. Check secret files exist at expected paths with correct permissions
|
||||||
|
3. Verify dependent services are running
|
||||||
|
4. Check `/var/lib/vault/cache/` is populated (fallback ready)
|
||||||
|
5. Reboot host to verify boot-time secret fetching works
|
||||||
109
docs/plans/completed/zigbee-sensor-battery-monitoring.md
Normal file
109
docs/plans/completed/zigbee-sensor-battery-monitoring.md
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# Zigbee Sensor Battery Monitoring
|
||||||
|
|
||||||
|
**Status:** Completed
|
||||||
|
**Branch:** `zigbee-battery-fix`
|
||||||
|
**Commit:** `c515a6b home-assistant: fix zigbee sensor battery reporting`
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
Three Aqara Zigbee temperature sensors report `battery: 0` in their MQTT payload, making the `hass_sensor_battery_percent` Prometheus metric useless for battery monitoring on these devices.
|
||||||
|
|
||||||
|
Affected sensors:
|
||||||
|
- **Temp Living Room** (`0x54ef441000a54d3c`) — WSDCGQ12LM
|
||||||
|
- **Temp Office** (`0x54ef441000a547bd`) — WSDCGQ12LM
|
||||||
|
- **temp_server** (`0x54ef441000a564b6`) — WSDCGQ12LM
|
||||||
|
|
||||||
|
The **Temp Bedroom** sensor (`0x00124b0025495463`) is a SONOFF SNZB-02 and reports battery correctly.
|
||||||
|
|
||||||
|
## Findings
|
||||||
|
|
||||||
|
- All three sensors are actively reporting temperature, humidity, and pressure data — they are not dead.
|
||||||
|
- The Zigbee2MQTT payload includes a `voltage` field (e.g., `2707` = 2.707V), which indicates healthy battery levels (~40-60% for a CR2032 coin cell).
|
||||||
|
- CR2032 voltage reference: ~3.0V fresh, ~2.7V mid-life, ~2.1V dead.
|
||||||
|
- The `voltage` field is not exposed as a Prometheus metric — it exists only in the MQTT payload.
|
||||||
|
- This is a known firmware quirk with some Aqara WSDCGQ12LM sensors that always report 0% battery.
|
||||||
|
|
||||||
|
## Device Inventory
|
||||||
|
|
||||||
|
Full list of Zigbee devices on ha1 (12 total):
|
||||||
|
|
||||||
|
| Device | IEEE Address | Model | Type |
|
||||||
|
|--------|-------------|-------|------|
|
||||||
|
| temp_server | 0x54ef441000a564b6 | WSDCGQ12LM | Temperature sensor (battery fix applied) |
|
||||||
|
| (Temp Living Room) | 0x54ef441000a54d3c | WSDCGQ12LM | Temperature sensor (battery fix applied) |
|
||||||
|
| (Temp Office) | 0x54ef441000a547bd | WSDCGQ12LM | Temperature sensor (battery fix applied) |
|
||||||
|
| (Temp Bedroom) | 0x00124b0025495463 | SNZB-02 | Temperature sensor (battery works) |
|
||||||
|
| (Water leak) | 0x54ef4410009ac117 | SJCGQ12LM | Water leak sensor |
|
||||||
|
| btn_livingroom | 0x54ef441000a1f907 | WXKG13LM | Wireless mini switch |
|
||||||
|
| btn_bedroom | 0x54ef441000a1ee71 | WXKG13LM | Wireless mini switch |
|
||||||
|
| (Hue bulb) | 0x001788010dc35d06 | 9290024688 | Hue E27 1100lm (Router) |
|
||||||
|
| (Hue bulb) | 0x001788010dc5f003 | 9290024688 | Hue E27 1100lm (Router) |
|
||||||
|
| (Hue ceiling) | 0x001788010e371aa4 | 915005997301 | Hue Infuse medium (Router) |
|
||||||
|
| (Hue ceiling) | 0x001788010d253b99 | 915005997301 | Hue Infuse medium (Router) |
|
||||||
|
| (Hue wall) | 0x001788010d1b599a | 929003052901 | Hue Sana wall light (Router, transition=5) |
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Solution 1: Calculate battery from voltage in Zigbee2MQTT (Implemented)
|
||||||
|
|
||||||
|
Override the Home Assistant battery entity's `value_template` in Zigbee2MQTT device configuration to calculate battery percentage from voltage.
|
||||||
|
|
||||||
|
**Formula:** `(voltage - 2100) / 9` (maps 2100-3000mV to 0-100%)
|
||||||
|
|
||||||
|
**Changes in `services/home-assistant/default.nix`:**
|
||||||
|
- Device configuration moved from external `devices.yaml` to inline NixOS config
|
||||||
|
- Three affected sensors have `homeassistant.sensor_battery.value_template` override
|
||||||
|
- All 12 devices now declaratively managed
|
||||||
|
|
||||||
|
**Expected battery values based on current voltages:**
|
||||||
|
| Sensor | Voltage | Expected Battery |
|
||||||
|
|--------|---------|------------------|
|
||||||
|
| Temp Living Room | 2710 mV | ~68% |
|
||||||
|
| Temp Office | 2658 mV | ~62% |
|
||||||
|
| temp_server | 2765 mV | ~74% |
|
||||||
|
|
||||||
|
### Solution 2: Alert on sensor staleness (Implemented)
|
||||||
|
|
||||||
|
Added Prometheus alert `zigbee_sensor_stale` in `services/monitoring/rules.yml` that fires when a Zigbee temperature sensor hasn't updated in over 1 hour. This provides defense-in-depth for detecting dead sensors regardless of battery reporting accuracy.
|
||||||
|
|
||||||
|
**Alert details:**
|
||||||
|
- Expression: `(time() - hass_last_updated_time_seconds{entity=~"sensor\\.(0x[0-9a-f]+|temp_server)_temperature"}) > 3600`
|
||||||
|
- Severity: warning
|
||||||
|
- For: 5m
|
||||||
|
|
||||||
|
## Pre-Deployment Verification
|
||||||
|
|
||||||
|
### Backup Verification
|
||||||
|
|
||||||
|
Before deployment, verified ha1 backup configuration and ran manual backup:
|
||||||
|
|
||||||
|
**Backup paths:**
|
||||||
|
- `/var/lib/hass` ✓
|
||||||
|
- `/var/lib/zigbee2mqtt` ✓
|
||||||
|
- `/var/lib/mosquitto` ✓
|
||||||
|
|
||||||
|
**Manual backup (2026-02-05 22:45:23):**
|
||||||
|
- Snapshot ID: `59704dfa`
|
||||||
|
- Files: 77 total (0 new, 13 changed, 64 unmodified)
|
||||||
|
- Data: 62.635 MiB processed, 6.928 MiB stored (compressed)
|
||||||
|
|
||||||
|
### Other directories reviewed
|
||||||
|
|
||||||
|
- `/var/lib/vault` — Contains AppRole credentials; not backed up (can be re-provisioned via Ansible)
|
||||||
|
- `/var/lib/sops-nix` — Legacy; ha1 uses Vault now
|
||||||
|
|
||||||
|
## Post-Deployment Steps
|
||||||
|
|
||||||
|
After deploying to ha1:
|
||||||
|
|
||||||
|
1. Restart zigbee2mqtt service (automatic on NixOS rebuild)
|
||||||
|
2. In Home Assistant, the battery entities may need to be re-discovered:
|
||||||
|
- Go to Settings → Devices & Services → MQTT
|
||||||
|
- The new `value_template` should take effect after entity re-discovery
|
||||||
|
- If not, try disabling and re-enabling the battery entities
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Device configuration is now declarative in NixOS. Future device additions via Zigbee2MQTT frontend will need to be added to the NixOS config to persist.
|
||||||
|
- The `devices.yaml` file on ha1 will be overwritten on service start but can be removed after confirming the new config works.
|
||||||
|
- The NixOS zigbee2mqtt module defaults to `devices = "devices.yaml"` but our explicit inline config overrides this.
|
||||||
179
docs/plans/homelab-exporter.md
Normal file
179
docs/plans/homelab-exporter.md
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
# Homelab Infrastructure Exporter
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Build a Prometheus exporter for metrics specific to our homelab infrastructure. Unlike the generic nixos-exporter, this covers services and patterns unique to our environment.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### Existing Exporters
|
||||||
|
- **node-exporter** (all hosts): System metrics
|
||||||
|
- **systemd-exporter** (all hosts): Service restart counts, IP accounting
|
||||||
|
- **labmon** (monitoring01): TLS certificate monitoring, step-ca health
|
||||||
|
- **Service-specific**: unbound, postgres, nats, jellyfin, home-assistant, caddy, step-ca
|
||||||
|
|
||||||
|
### Gaps
|
||||||
|
- No visibility into Vault/OpenBao lease expiry
|
||||||
|
- No ACME certificate expiry from internal CA
|
||||||
|
- No Proxmox guest agent metrics from inside VMs
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
### Vault/OpenBao Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_vault_token_expiry_seconds` | Seconds until AppRole token expires | Token metadata or lease file |
|
||||||
|
| `homelab_vault_token_renewable` | 1 if token is renewable | Token metadata |
|
||||||
|
|
||||||
|
Labels: `role` (AppRole name)
|
||||||
|
|
||||||
|
### ACME Certificate Metrics
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_acme_cert_expiry_seconds` | Seconds until certificate expires | Parse cert from `/var/lib/acme/*/cert.pem` |
|
||||||
|
| `homelab_acme_cert_not_after` | Unix timestamp of cert expiry | Certificate NotAfter field |
|
||||||
|
|
||||||
|
Labels: `domain`, `issuer`
|
||||||
|
|
||||||
|
Note: labmon already monitors external TLS endpoints. This covers local ACME-managed certs.
|
||||||
|
|
||||||
|
### Proxmox Guest Metrics (future)
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_proxmox_guest_info` | Info gauge with VM ID, name | QEMU guest agent |
|
||||||
|
| `homelab_proxmox_guest_agent_running` | 1 if guest agent is responsive | Agent ping |
|
||||||
|
|
||||||
|
### DNS Zone Metrics (future)
|
||||||
|
|
||||||
|
| Metric | Description | Source |
|
||||||
|
|--------|-------------|--------|
|
||||||
|
| `homelab_dns_zone_serial` | Current zone serial number | DNS AXFR or zone file |
|
||||||
|
|
||||||
|
Labels: `zone`
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
Single binary with collectors enabled via config. Runs on hosts that need specific collectors.
|
||||||
|
|
||||||
|
```
|
||||||
|
homelab-exporter
|
||||||
|
├── main.go
|
||||||
|
├── collector/
|
||||||
|
│ ├── vault.go # Vault/OpenBao token metrics
|
||||||
|
│ ├── acme.go # ACME certificate metrics
|
||||||
|
│ └── proxmox.go # Proxmox guest agent (future)
|
||||||
|
└── config/
|
||||||
|
└── config.go
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
listen_addr: ":9970"
|
||||||
|
collectors:
|
||||||
|
vault:
|
||||||
|
enabled: true
|
||||||
|
token_path: "/var/lib/vault/token"
|
||||||
|
acme:
|
||||||
|
enabled: true
|
||||||
|
cert_dirs:
|
||||||
|
- "/var/lib/acme"
|
||||||
|
proxmox:
|
||||||
|
enabled: false
|
||||||
|
```
|
||||||
|
|
||||||
|
## NixOS Module
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.homelab-exporter = {
|
||||||
|
enable = true;
|
||||||
|
port = 9970;
|
||||||
|
collectors = {
|
||||||
|
vault = {
|
||||||
|
enable = true;
|
||||||
|
tokenPath = "/var/lib/vault/token";
|
||||||
|
};
|
||||||
|
acme = {
|
||||||
|
enable = true;
|
||||||
|
certDirs = [ "/var/lib/acme" ];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Auto-register scrape target
|
||||||
|
homelab.monitoring.scrapeTargets = [{
|
||||||
|
job_name = "homelab-exporter";
|
||||||
|
port = 9970;
|
||||||
|
}];
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration
|
||||||
|
|
||||||
|
### Deployment
|
||||||
|
|
||||||
|
Deploy on hosts that have relevant data:
|
||||||
|
- **All hosts with ACME certs**: acme collector
|
||||||
|
- **All hosts with Vault**: vault collector
|
||||||
|
- **Proxmox VMs**: proxmox collector (when implemented)
|
||||||
|
|
||||||
|
### Relationship with nixos-exporter
|
||||||
|
|
||||||
|
These are complementary:
|
||||||
|
- **nixos-exporter** (port 9971): Generic NixOS metrics, deploy everywhere
|
||||||
|
- **homelab-exporter** (port 9970): Infrastructure-specific, deploy selectively
|
||||||
|
|
||||||
|
Both can run on the same host if needed.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Language
|
||||||
|
|
||||||
|
Go - consistent with labmon and nixos-exporter.
|
||||||
|
|
||||||
|
### Phase 1: Core + ACME
|
||||||
|
1. Create git repository (git.t-juice.club/torjus/homelab-exporter)
|
||||||
|
2. Implement ACME certificate collector
|
||||||
|
3. HTTP server with `/metrics`
|
||||||
|
4. NixOS module
|
||||||
|
|
||||||
|
### Phase 2: Vault Collector
|
||||||
|
1. Implement token expiry detection
|
||||||
|
2. Handle missing/expired tokens gracefully
|
||||||
|
|
||||||
|
### Phase 3: Dashboard
|
||||||
|
1. Create Grafana dashboard for infrastructure health
|
||||||
|
2. Add to existing monitoring service module
|
||||||
|
|
||||||
|
## Alert Examples
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- alert: VaultTokenExpiringSoon
|
||||||
|
expr: homelab_vault_token_expiry_seconds < 3600
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Vault token on {{ $labels.instance }} expires in < 1 hour"
|
||||||
|
|
||||||
|
- alert: ACMECertExpiringSoon
|
||||||
|
expr: homelab_acme_cert_expiry_seconds < 7 * 24 * 3600
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "ACME cert {{ $labels.domain }} on {{ $labels.instance }} expires in < 7 days"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] How to read Vault token expiry without re-authenticating?
|
||||||
|
- [ ] Should ACME collector also check key/cert match?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Port 9970 (labmon uses 9969, nixos-exporter will use 9971)
|
||||||
|
- Keep infrastructure-specific logic here, generic NixOS stuff in nixos-exporter
|
||||||
|
- Consider merging Proxmox metrics with pve-exporter if overlap is significant
|
||||||
220
docs/plans/host-migration-to-opentofu.md
Normal file
220
docs/plans/host-migration-to-opentofu.md
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
# Host Migration to OpenTofu
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Migrate all existing hosts (provisioned manually before the OpenTofu pipeline) into the new
|
||||||
|
OpenTofu-managed provisioning workflow. Hosts are categorized by their state requirements:
|
||||||
|
stateless hosts are simply recreated, stateful hosts require backup and restore, and some
|
||||||
|
hosts are decommissioned or deferred.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
Hosts already managed by OpenTofu: `vault01`, `testvm01`, `testvm02`, `testvm03`, `ns2`, `ns1`
|
||||||
|
|
||||||
|
Hosts to migrate:
|
||||||
|
|
||||||
|
| Host | Category | Notes |
|
||||||
|
|------|----------|-------|
|
||||||
|
| ~~ns1~~ | ~~Stateless~~ | ✓ Complete |
|
||||||
|
| nix-cache01 | Stateless | Binary cache, recreate |
|
||||||
|
| http-proxy | Stateless | Reverse proxy, recreate |
|
||||||
|
| nats1 | Stateless | Messaging, recreate |
|
||||||
|
| ha1 | Stateful | Home Assistant + Zigbee2MQTT + Mosquitto |
|
||||||
|
| ~~monitoring01~~ | ~~Decommission~~ | ✓ Complete — replaced by monitoring02 (VictoriaMetrics) |
|
||||||
|
| jelly01 | Stateful | Jellyfin metadata, watch history, config |
|
||||||
|
| ~~pgdb1~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
|
| ~~jump~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
|
| ~~auth01~~ | ~~Decommission~~ | ✓ Complete |
|
||||||
|
| ~~ca~~ | ~~Deferred~~ | ✓ Complete |
|
||||||
|
|
||||||
|
## Phase 1: Backup Preparation
|
||||||
|
|
||||||
|
Before migrating any stateful host, ensure restic backups are in place and verified.
|
||||||
|
|
||||||
|
### ~~1a. Expand monitoring01 Grafana Backup~~ ✓ N/A
|
||||||
|
|
||||||
|
~~The existing backup only covers `/var/lib/grafana/plugins` and a sqlite dump of `grafana.db`.
|
||||||
|
Expand to back up all of `/var/lib/grafana/` to capture config directory and any other state.~~
|
||||||
|
|
||||||
|
No longer needed — monitoring01 decommissioned, replaced by monitoring02 with declarative Grafana dashboards.
|
||||||
|
|
||||||
|
### 1b. Add Jellyfin Backup to jelly01
|
||||||
|
|
||||||
|
No backup currently exists. Add a restic backup job for `/var/lib/jellyfin/` which contains:
|
||||||
|
- `config/` — server settings, library configuration
|
||||||
|
- `data/` — user watch history, playback state, library metadata
|
||||||
|
|
||||||
|
Media files are on the NAS (`nas.home.2rjus.net:/mnt/hdd-pool/media`) and do not need backup.
|
||||||
|
The cache directory (`/var/cache/jellyfin/`) does not need backup — it regenerates.
|
||||||
|
|
||||||
|
### 1c. Verify Existing ha1 Backup
|
||||||
|
|
||||||
|
ha1 already backs up `/var/lib/hass`, `/var/lib/zigbee2mqtt`, `/var/lib/mosquitto`. Verify
|
||||||
|
these backups are current and restorable before proceeding with migration.
|
||||||
|
|
||||||
|
### 1d. Verify All Backups
|
||||||
|
|
||||||
|
After adding/expanding backup jobs:
|
||||||
|
1. Trigger a manual backup run on each host
|
||||||
|
2. Verify backup integrity with `restic check`
|
||||||
|
3. Test a restore to a temporary location to confirm data is recoverable
|
||||||
|
|
||||||
|
## Phase 2: Stateless Host Migration
|
||||||
|
|
||||||
|
These hosts have no meaningful state and can be recreated fresh. For each host:
|
||||||
|
|
||||||
|
1. Add the host definition to `terraform/vms.tf` (using `create-host` or manually)
|
||||||
|
2. Commit and push to master
|
||||||
|
3. Run `tofu apply` to provision the new VM
|
||||||
|
4. Wait for bootstrap to complete (VM pulls config from master and reboots)
|
||||||
|
5. Verify the host is functional
|
||||||
|
6. Decommission the old VM in Proxmox
|
||||||
|
|
||||||
|
### Migration Order
|
||||||
|
|
||||||
|
Migrate stateless hosts in an order that minimizes disruption:
|
||||||
|
|
||||||
|
1. **nix-cache01** — low risk, no downstream dependencies during migration
|
||||||
|
2. **nats1** — low risk, verify no persistent JetStream streams first
|
||||||
|
3. **http-proxy** — brief disruption to proxied services, migrate during low-traffic window
|
||||||
|
4. ~~**ns1** — ns2 already migrated, verify AXFR works after ns1 migration~~ ✓ Complete
|
||||||
|
|
||||||
|
~~For ns1/ns2: migrate ns2 first (secondary), verify AXFR works, then migrate ns1.~~ Both ns1
|
||||||
|
and ns2 migration complete. Zone transfer (AXFR) verified working between ns1 (primary) and
|
||||||
|
ns2 (secondary).
|
||||||
|
|
||||||
|
## Phase 3: Stateful Host Migration
|
||||||
|
|
||||||
|
For each stateful host, the procedure is:
|
||||||
|
|
||||||
|
1. Trigger a final restic backup
|
||||||
|
2. Stop services on the old host (to prevent state drift during migration)
|
||||||
|
3. Provision the new VM via `tofu apply`
|
||||||
|
4. Wait for bootstrap to complete
|
||||||
|
5. Stop the relevant services on the new host
|
||||||
|
6. Restore data from restic backup
|
||||||
|
7. Start services and verify functionality
|
||||||
|
8. Decommission the old VM
|
||||||
|
|
||||||
|
### 3a. monitoring01 ✓ COMPLETE
|
||||||
|
|
||||||
|
~~1. Run final Grafana backup~~
|
||||||
|
~~2. Provision new monitoring01 via OpenTofu~~
|
||||||
|
~~3. After bootstrap, restore `/var/lib/grafana/` from restic~~
|
||||||
|
~~4. Restart Grafana, verify dashboards and datasources are intact~~
|
||||||
|
~~5. Prometheus and Loki start fresh with empty data (acceptable)~~
|
||||||
|
~~6. Verify all scrape targets are being collected~~
|
||||||
|
~~7. Decommission old VM~~
|
||||||
|
|
||||||
|
Replaced by monitoring02 with VictoriaMetrics, standalone Loki and Grafana modules. Host configuration, old service modules, and terraform resources removed.
|
||||||
|
|
||||||
|
### 3b. jelly01
|
||||||
|
|
||||||
|
1. Run final Jellyfin backup
|
||||||
|
2. Provision new jelly01 via OpenTofu
|
||||||
|
3. After bootstrap, restore `/var/lib/jellyfin/` from restic
|
||||||
|
4. Verify NFS mount to NAS is working
|
||||||
|
5. Start Jellyfin, verify watch history and library metadata are present
|
||||||
|
6. Decommission old VM
|
||||||
|
|
||||||
|
### 3c. ha1
|
||||||
|
|
||||||
|
1. Verify latest restic backup is current
|
||||||
|
2. Stop Home Assistant, Zigbee2MQTT, and Mosquitto on old host
|
||||||
|
3. Provision new ha1 via OpenTofu
|
||||||
|
4. After bootstrap, restore `/var/lib/hass`, `/var/lib/zigbee2mqtt`, `/var/lib/mosquitto`
|
||||||
|
5. Start services, verify Home Assistant is functional
|
||||||
|
6. Verify Zigbee devices are still paired and communicating
|
||||||
|
7. Decommission old VM
|
||||||
|
|
||||||
|
**Note:** ha1 currently has 2 GB RAM, which is consistently tight. Average memory usage has
|
||||||
|
climbed from ~57% (30-day avg) to ~70% currently, with a 30-day low of only 187 MB free.
|
||||||
|
Consider increasing to 4 GB when reprovisioning to allow headroom for additional integrations.
|
||||||
|
|
||||||
|
**Note:** ha1 is the highest-risk migration due to Zigbee device pairings. The Zigbee
|
||||||
|
coordinator state in `/var/lib/zigbee2mqtt` should preserve pairings, but verify on a
|
||||||
|
non-critical time window.
|
||||||
|
|
||||||
|
**USB Passthrough:** The ha1 VM has a USB device passed through from the Proxmox hypervisor
|
||||||
|
(the Zigbee coordinator). The new VM must be configured with the same USB passthrough in
|
||||||
|
OpenTofu/Proxmox. Verify the USB device ID on the hypervisor and add the appropriate
|
||||||
|
`usb` block to the VM definition in `terraform/vms.tf`. The USB device must be passed
|
||||||
|
through before starting Zigbee2MQTT on the new host.
|
||||||
|
|
||||||
|
## Phase 4: Decommission Hosts
|
||||||
|
|
||||||
|
### jump ✓ COMPLETE
|
||||||
|
|
||||||
|
~~1. Verify nothing depends on the jump host (no SSH proxy configs pointing to it, etc.)~~
|
||||||
|
~~2. Remove host configuration from `hosts/jump/`~~
|
||||||
|
~~3. Remove from `flake.nix`~~
|
||||||
|
~~4. Remove any secrets in `secrets/jump/`~~
|
||||||
|
~~5. Remove from `.sops.yaml`~~
|
||||||
|
~~6. Destroy the VM in Proxmox~~
|
||||||
|
~~7. Commit cleanup~~
|
||||||
|
|
||||||
|
Host was already removed from flake.nix and VM destroyed. Configuration cleaned up in ba9f47f.
|
||||||
|
|
||||||
|
### auth01 ✓ COMPLETE
|
||||||
|
|
||||||
|
~~1. Remove host configuration from `hosts/auth01/`~~
|
||||||
|
~~2. Remove from `flake.nix`~~
|
||||||
|
~~3. Remove any secrets in `secrets/auth01/`~~
|
||||||
|
~~4. Remove from `.sops.yaml`~~
|
||||||
|
~~5. Remove `services/authelia/` and `services/lldap/` (only used by auth01)~~
|
||||||
|
~~6. Destroy the VM in Proxmox~~
|
||||||
|
~~7. Commit cleanup~~
|
||||||
|
|
||||||
|
Host configuration, services, and VM already removed.
|
||||||
|
|
||||||
|
### pgdb1 ✓ COMPLETE
|
||||||
|
|
||||||
|
~~Only consumer was Open WebUI on gunter, which has been migrated to use local PostgreSQL.~~
|
||||||
|
|
||||||
|
~~1. Verify Open WebUI on gunter is using local PostgreSQL (not pgdb1)~~
|
||||||
|
~~2. Remove host configuration from `hosts/pgdb1/`~~
|
||||||
|
~~3. Remove `services/postgres/` (only used by pgdb1)~~
|
||||||
|
~~4. Remove from `flake.nix`~~
|
||||||
|
~~5. Remove Vault AppRole from `terraform/vault/approle.tf`~~
|
||||||
|
~~6. Destroy the VM in Proxmox~~
|
||||||
|
~~7. Commit cleanup~~
|
||||||
|
|
||||||
|
Host configuration, services, terraform resources, and VM removed. See `docs/plans/pgdb1-decommission.md` for detailed plan.
|
||||||
|
|
||||||
|
## Phase 5: Decommission ca Host ✓ COMPLETE
|
||||||
|
|
||||||
|
~~Deferred until Phase 4c (PKI migration to OpenBao) is complete. Once all hosts use the
|
||||||
|
OpenBao ACME endpoint for certificates, the step-ca host can be decommissioned following
|
||||||
|
the same cleanup steps as the jump host.~~
|
||||||
|
|
||||||
|
PKI migration to OpenBao complete. Host configuration, `services/ca/`, and VM removed.
|
||||||
|
|
||||||
|
## Phase 6: Remove sops-nix ✓ COMPLETE
|
||||||
|
|
||||||
|
~~Once `ca` is decommissioned (Phase 5), `sops-nix` is no longer used by any host. Remove
|
||||||
|
all remnants:~~
|
||||||
|
~~- `sops-nix` input from `flake.nix` and `flake.lock`~~
|
||||||
|
~~- `sops-nix.nixosModules.sops` from all host module lists in `flake.nix`~~
|
||||||
|
~~- `inherit sops-nix` from all specialArgs in `flake.nix`~~
|
||||||
|
~~- `system/sops.nix` and its import in `system/default.nix`~~
|
||||||
|
~~- `.sops.yaml`~~
|
||||||
|
~~- `secrets/` directory~~
|
||||||
|
~~- All `sops.secrets.*` declarations in `services/ca/`, `services/authelia/`, `services/lldap/`~~
|
||||||
|
~~- Template scripts that generate age keys for sops (`hosts/template/scripts.nix`,
|
||||||
|
`hosts/template2/scripts.nix`)~~
|
||||||
|
|
||||||
|
All sops-nix remnants removed. See `docs/plans/completed/sops-to-openbao-migration.md` for context.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Each host migration should be done individually, not in bulk, to limit blast radius
|
||||||
|
- Keep the old VM running until the new one is verified — do not destroy prematurely
|
||||||
|
- The old VMs use IPs that the new VMs need, so the old VM must be shut down before
|
||||||
|
the new one is provisioned (or use a temporary IP and swap after verification)
|
||||||
|
- Stateful migrations should be done during low-usage windows
|
||||||
|
- All decommissioned hosts (jump, auth01, ca) have now been removed
|
||||||
|
- Since many hosts are being recreated, this is a good opportunity to establish consistent
|
||||||
|
hostname naming conventions before provisioning the new VMs. Current naming is inconsistent
|
||||||
|
(e.g. `ns1` vs `nix-cache01`, `ha1` vs `auth01`, `pgdb1` vs `http-proxy`). Decide on a
|
||||||
|
convention before starting migrations — e.g. whether to always use numeric suffixes, a
|
||||||
|
consistent format like `service-NN`, role-based vs function-based names, etc.
|
||||||
79
docs/plans/local-ntp-chrony.md
Normal file
79
docs/plans/local-ntp-chrony.md
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# Local NTP with Chrony
|
||||||
|
|
||||||
|
## Overview/Goal
|
||||||
|
|
||||||
|
Set up pve1 as a local NTP server and switch all NixOS VMs from systemd-timesyncd to chrony, pointing at pve1 as the sole time source. This eliminates clock drift issues that cause false `host_reboot` alerts.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- All NixOS hosts use `systemd-timesyncd` with default NixOS pool servers (`0.nixos.pool.ntp.org` etc.)
|
||||||
|
- No NTP/timesyncd configuration exists in the repo — all defaults
|
||||||
|
- pve1 (Proxmox, bare metal) already runs chrony but only as a client
|
||||||
|
- VMs drift noticeably — ns1 (~19ms) and jelly01 (~39ms) are worst offenders
|
||||||
|
- Clock step corrections from timesyncd trigger false `host_reboot` alerts via `changes(node_boot_time_seconds[10m]) > 0`
|
||||||
|
- pve1 itself stays at 0ms offset thanks to chrony
|
||||||
|
|
||||||
|
## Why systemd-timesyncd is Insufficient
|
||||||
|
|
||||||
|
- Minimal SNTP client, no proper clock discipline or frequency tracking
|
||||||
|
- Backs off polling interval when it thinks clock is stable, missing drift
|
||||||
|
- Corrects via step adjustments rather than gradual slewing, causing metric jumps
|
||||||
|
- Each VM resolves to different pool servers with varying accuracy
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
### 1. Configure pve1 as NTP Server
|
||||||
|
|
||||||
|
Add to pve1's `/etc/chrony/chrony.conf`:
|
||||||
|
|
||||||
|
```
|
||||||
|
# Allow NTP clients from the infrastructure subnet
|
||||||
|
allow 10.69.13.0/24
|
||||||
|
```
|
||||||
|
|
||||||
|
Restart chrony on pve1.
|
||||||
|
|
||||||
|
### 2. Add Chrony to NixOS System Config
|
||||||
|
|
||||||
|
Create `system/chrony.nix` (applied to all hosts via system imports):
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{
|
||||||
|
# Disable systemd-timesyncd (chrony takes over)
|
||||||
|
services.timesyncd.enable = false;
|
||||||
|
|
||||||
|
# Enable chrony pointing at pve1
|
||||||
|
services.chrony = {
|
||||||
|
enable = true;
|
||||||
|
servers = [ "pve1.home.2rjus.net" ];
|
||||||
|
serverOption = "iburst";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Optional: Add Chrony Exporter
|
||||||
|
|
||||||
|
For better visibility into NTP sync quality:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.prometheus.exporters.chrony.enable = true;
|
||||||
|
```
|
||||||
|
|
||||||
|
Add chrony exporter scrape targets via `homelab.monitoring.scrapeTargets` and create a Grafana dashboard for NTP offset across all hosts.
|
||||||
|
|
||||||
|
### 4. Roll Out
|
||||||
|
|
||||||
|
- Deploy to a test-tier host first to verify
|
||||||
|
- Then deploy to all hosts via auto-upgrade
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] Does pve1's chrony config need `local stratum 10` as fallback if upstream is unreachable?
|
||||||
|
- [ ] Should we also enable `enableRTCTrimming` for the VMs?
|
||||||
|
- [ ] Worth adding a chrony exporter on pve1 as well (manual install like node-exporter)?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- No fallback NTP servers needed on VMs — if pve1 is down, all VMs are down too
|
||||||
|
- The `host_reboot` alert rule (`changes(node_boot_time_seconds[10m]) > 0`) should stop false-firing once clock corrections are slewed instead of stepped
|
||||||
|
- pn01/pn02 are bare metal but still benefit from syncing to pve1 for consistency
|
||||||
196
docs/plans/loki-improvements.md
Normal file
196
docs/plans/loki-improvements.md
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
# Loki Setup Improvements
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The current Loki deployment on monitoring01 is functional but minimal. It lacks retention policies, rate limiting, and uses local filesystem storage. This plan evaluates improvement options across several dimensions: retention management, storage backend, resource limits, and operational improvements.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
**Loki** on monitoring01 (`services/monitoring/loki.nix`):
|
||||||
|
- Single-node deployment, no HA
|
||||||
|
- Filesystem storage at `/var/lib/loki/chunks` (~6.8 GB as of 2026-02-13)
|
||||||
|
- TSDB index (v13 schema, 24h period)
|
||||||
|
- 30-day compactor-based retention with basic rate limits
|
||||||
|
- No caching layer
|
||||||
|
- Auth disabled (trusted network)
|
||||||
|
|
||||||
|
**Promtail** on all 16 hosts (`system/monitoring/logs.nix`):
|
||||||
|
- Ships systemd journal (JSON) + `/var/log/**/*.log`
|
||||||
|
- Labels: `hostname`, `tier`, `role`, `level`, `job` (systemd-journal/varlog), `systemd_unit`
|
||||||
|
- `level` label mapped from journal PRIORITY (critical/error/warning/notice/info/debug)
|
||||||
|
- Hardcoded to `http://monitoring01.home.2rjus.net:3100`
|
||||||
|
|
||||||
|
**Additional log sources:**
|
||||||
|
- `pipe-to-loki` script (manual log submission, `job=pipe-to-loki`)
|
||||||
|
- Bootstrap logs from template2 (`job=bootstrap`)
|
||||||
|
|
||||||
|
**Context:** The VictoriaMetrics migration plan (`docs/plans/monitoring-migration-victoriametrics.md`) includes moving Loki to monitoring02 with "same configuration as current". These improvements could be applied either before or after that migration.
|
||||||
|
|
||||||
|
## Improvement Areas
|
||||||
|
|
||||||
|
### 1. Retention Policy
|
||||||
|
|
||||||
|
**Implemented.** Compactor-based retention with 30-day period. Note: Loki 3.6.3 requires `delete_request_store = "filesystem"` when retention is enabled (not documented in older guides).
|
||||||
|
|
||||||
|
```nix
|
||||||
|
compactor = {
|
||||||
|
working_directory = "/var/lib/loki/compactor";
|
||||||
|
compaction_interval = "10m";
|
||||||
|
retention_enabled = true;
|
||||||
|
retention_delete_delay = "2h";
|
||||||
|
retention_delete_worker_count = 150;
|
||||||
|
delete_request_store = "filesystem";
|
||||||
|
};
|
||||||
|
|
||||||
|
limits_config = {
|
||||||
|
retention_period = "30d";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Storage Backend
|
||||||
|
|
||||||
|
**Decision:** Stay with filesystem storage for now. Garage S3 was considered but ruled out - the current single-node Garage (replication_factor=1) offers no real durability benefit over local disk. S3 storage can be revisited after the NAS migration, when a more robust S3-compatible solution will likely be available.
|
||||||
|
|
||||||
|
### 3. Limits Configuration
|
||||||
|
|
||||||
|
**Implemented.** Basic guardrails added alongside retention in `limits_config`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
limits_config = {
|
||||||
|
retention_period = "30d";
|
||||||
|
ingestion_rate_mb = 10; # MB/s per tenant
|
||||||
|
ingestion_burst_size_mb = 20; # Burst allowance
|
||||||
|
max_streams_per_user = 10000; # Prevent label explosion
|
||||||
|
max_query_series = 500; # Limit query resource usage
|
||||||
|
max_query_parallelism = 8;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Promtail Label Improvements
|
||||||
|
|
||||||
|
**Problem:** Label inconsistencies and missing useful metadata:
|
||||||
|
- The `varlog` scrape config uses `hostname` while journal uses `host` (different label name)
|
||||||
|
- No `tier` or `role` labels, making it hard to filter logs by deployment tier or host function
|
||||||
|
|
||||||
|
**Implemented:** Standardized on `hostname` to match Prometheus labels. The journal scrape previously used a relabel from `__journal__hostname` to `host`; now both scrape configs use a static `hostname` label from `config.networking.hostName`. Also updated `pipe-to-loki` and bootstrap scripts to use `hostname` instead of `host`.
|
||||||
|
|
||||||
|
1. **Standardized label:** Both scrape configs use `hostname` (matching Prometheus) via shared `hostLabels`
|
||||||
|
2. **Added `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs
|
||||||
|
3. **Added `role` label:** Static label from `config.homelab.host.role` on both scrape configs (conditionally, only when non-null)
|
||||||
|
|
||||||
|
No cardinality impact - `tier` and `role` are 1:1 with `hostname`, so they add metadata to existing streams without creating new ones.
|
||||||
|
|
||||||
|
This enables queries like:
|
||||||
|
- `{tier="prod"} |= "error"` - all errors on prod hosts
|
||||||
|
- `{role="dns"}` - all DNS server logs
|
||||||
|
- `{tier="test", job="systemd-journal"}` - journal logs from test hosts
|
||||||
|
|
||||||
|
### 5. Journal Priority → Level Label
|
||||||
|
|
||||||
|
**Implemented.** Promtail pipeline stages map journal `PRIORITY` to a `level` label:
|
||||||
|
|
||||||
|
| PRIORITY | level |
|
||||||
|
|----------|-------|
|
||||||
|
| 0-2 | critical |
|
||||||
|
| 3 | error |
|
||||||
|
| 4 | warning |
|
||||||
|
| 5 | notice |
|
||||||
|
| 6 | info |
|
||||||
|
| 7 | debug |
|
||||||
|
|
||||||
|
Uses a `json` stage to extract PRIORITY, `template` to map to level name, and `labels` to attach it. This gives reliable level filtering for all journal logs, unlike Loki's `detected_level` which only works for apps that embed level keywords in message text.
|
||||||
|
|
||||||
|
Example queries:
|
||||||
|
- `{level="error"}` - all errors across the fleet
|
||||||
|
- `{level=~"critical|error", tier="prod"}` - prod errors and criticals
|
||||||
|
- `{level="warning", role="dns"}` - warnings from DNS servers
|
||||||
|
|
||||||
|
### 6. Enable JSON Logging on Services
|
||||||
|
|
||||||
|
**Problem:** Many services support structured JSON log output but may be using plain text by default. JSON logs are significantly easier to query in Loki - `| json` cleanly extracts all fields, whereas plain text requires fragile regex or pattern matching.
|
||||||
|
|
||||||
|
**Audit results (2026-02-13):**
|
||||||
|
|
||||||
|
**Already logging JSON:**
|
||||||
|
- Caddy (all instances) - JSON by default for access logs
|
||||||
|
- homelab-deploy (listener/builder) - Go app, logs structured JSON
|
||||||
|
|
||||||
|
**Supports JSON, not configured (high value):**
|
||||||
|
|
||||||
|
| Service | How to enable | Config file |
|
||||||
|
|---------|--------------|-------------|
|
||||||
|
| Prometheus | `--log.format=json` | `services/monitoring/prometheus.nix` |
|
||||||
|
| Alertmanager | `--log.format=json` | `services/monitoring/prometheus.nix` |
|
||||||
|
| Loki | `--log.format=json` | `services/monitoring/loki.nix` |
|
||||||
|
| Grafana | `log.console.format = "json"` | `services/monitoring/grafana.nix` |
|
||||||
|
| Tempo | `log_format: json` in config | `services/monitoring/tempo.nix` |
|
||||||
|
| OpenBao | `log_format = "json"` | `services/vault/default.nix` |
|
||||||
|
|
||||||
|
**Supports JSON, not configured (lower value - minimal log output):**
|
||||||
|
|
||||||
|
| Service | How to enable |
|
||||||
|
|---------|--------------|
|
||||||
|
| Pyroscope | `--log.format=json` (OCI container) |
|
||||||
|
| Blackbox Exporter | `--log.format=json` |
|
||||||
|
| Node Exporter | `--log.format=json` (all 16 hosts) |
|
||||||
|
| Systemd Exporter | `--log.format=json` (all 16 hosts) |
|
||||||
|
|
||||||
|
**No JSON support (syslog/text only):**
|
||||||
|
- NSD, Unbound, OpenSSH, Mosquitto
|
||||||
|
|
||||||
|
**Needs verification:**
|
||||||
|
- Kanidm, Jellyfin, Home Assistant, Harmonia, Zigbee2MQTT, NATS
|
||||||
|
|
||||||
|
**Recommendation:** Start with the monitoring stack (Prometheus, Alertmanager, Loki, Grafana, Tempo) since they're all Go apps with the same `--log.format=json` flag. Then OpenBao. The exporters are lower priority since they produce minimal log output.
|
||||||
|
|
||||||
|
### 7. Monitoring CNAME for Promtail Target
|
||||||
|
|
||||||
|
**Problem:** Promtail hardcodes `monitoring01.home.2rjus.net:3100`. The VictoriaMetrics migration plan already addresses this by switching to a `monitoring` CNAME.
|
||||||
|
|
||||||
|
**Recommendation:** This should happen as part of the monitoring02 migration, not independently. If we do Loki improvements before that migration, keep pointing to monitoring01.
|
||||||
|
|
||||||
|
## Priority Ranking
|
||||||
|
|
||||||
|
| # | Improvement | Effort | Impact | Status |
|
||||||
|
|---|-------------|--------|--------|--------|
|
||||||
|
| 1 | **Retention policy** | Low | High | Done (30d compactor retention) |
|
||||||
|
| 2 | **Limits config** | Low | Medium | Done (rate limits + stream guards) |
|
||||||
|
| 3 | **Promtail labels** | Trivial | Low | Done (hostname/tier/role/level) |
|
||||||
|
| 4 | **Journal priority → level** | Low-medium | Medium | Done (pipeline stages) |
|
||||||
|
| 5 | **JSON logging audit** | Low-medium | Medium | Audited, not yet enabled |
|
||||||
|
| 6 | **Monitoring CNAME** | Low | Medium | Part of monitoring02 migration |
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
### Phase 1: Retention + Labels (done 2026-02-13)
|
||||||
|
|
||||||
|
1. ~~Add `compactor` section to `services/monitoring/loki.nix`~~ Done
|
||||||
|
2. ~~Add `limits_config` with 30-day retention and basic rate limits~~ Done
|
||||||
|
3. ~~Update `system/monitoring/logs.nix`~~ Done:
|
||||||
|
- Standardized on `hostname` label (matching Prometheus) for both scrape configs
|
||||||
|
- Added `tier` and `role` static labels from `homelab.host` options
|
||||||
|
- Added pipeline stages for journal PRIORITY → `level` label mapping
|
||||||
|
4. ~~Update `pipe-to-loki` and bootstrap scripts to use `hostname`~~ Done
|
||||||
|
5. ~~Deploy and verify labels~~ Done - all 15 hosts reporting with correct labels
|
||||||
|
|
||||||
|
### Phase 2: JSON Logging (not started)
|
||||||
|
|
||||||
|
Enable JSON logging on services that support it, starting with the monitoring stack:
|
||||||
|
1. Prometheus, Alertmanager, Loki, Grafana, Tempo (`--log.format=json`)
|
||||||
|
2. OpenBao (`log_format = "json"`)
|
||||||
|
3. Lower priority: exporters (node-exporter, systemd-exporter, blackbox)
|
||||||
|
|
||||||
|
### Phase 3 (future): S3 Storage Migration
|
||||||
|
|
||||||
|
Revisit after NAS migration when a proper S3-compatible storage solution is available. At that point, add a new schema period with `object_store = "s3"` - the old filesystem period will continue serving historical data until it ages out past retention.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] Do we want per-stream retention (e.g., keep bootstrap/pipe-to-loki longer)?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Loki schema changes require adding a new period entry (not modifying existing ones). The old period continues serving historical data.
|
||||||
|
- Loki 3.6.3 requires `delete_request_store = "filesystem"` in the compactor config when retention is enabled.
|
||||||
|
- S3 storage deferred until post-NAS migration when a proper solution is available.
|
||||||
|
- As of 2026-02-13, Loki uses ~6.8 GB for ~30 days of logs from 16 hosts. Prometheus uses ~7.6 GB on the same disk (33 GB total, ~8 GB free).
|
||||||
122
docs/plans/long-term-metrics-storage.md
Normal file
122
docs/plans/long-term-metrics-storage.md
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
# Long-Term Metrics Storage Options
|
||||||
|
|
||||||
|
## Problem Statement
|
||||||
|
|
||||||
|
Current Prometheus configuration retains metrics for 30 days (`retentionTime = "30d"`). Extending retention further raises disk usage concerns on the homelab hypervisor with limited local storage.
|
||||||
|
|
||||||
|
Prometheus does not support downsampling - it stores all data at full resolution until the retention period expires, then deletes it entirely.
|
||||||
|
|
||||||
|
## Current Configuration
|
||||||
|
|
||||||
|
Location: `services/monitoring/prometheus.nix`
|
||||||
|
|
||||||
|
- **Retention**: 30 days
|
||||||
|
- **Scrape interval**: 15s
|
||||||
|
- **Features**: Alertmanager, Pushgateway, auto-generated scrape configs from flake hosts
|
||||||
|
- **Storage**: Local disk on monitoring01
|
||||||
|
|
||||||
|
## Options Evaluated
|
||||||
|
|
||||||
|
### Option 1: VictoriaMetrics
|
||||||
|
|
||||||
|
VictoriaMetrics is a Prometheus-compatible TSDB with significantly better compression (5-10x smaller storage footprint).
|
||||||
|
|
||||||
|
**NixOS Options Available:**
|
||||||
|
- `services.victoriametrics.enable`
|
||||||
|
- `services.victoriametrics.prometheusConfig` - accepts Prometheus scrape config format
|
||||||
|
- `services.victoriametrics.retentionPeriod` - e.g., "6m" for 6 months
|
||||||
|
- `services.vmagent` - dedicated scraping agent
|
||||||
|
- `services.vmalert` - alerting rules evaluation
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Simple migration - single service replacement
|
||||||
|
- Same PromQL query language - Grafana dashboards work unchanged
|
||||||
|
- Same scrape config format - existing auto-generated configs work as-is
|
||||||
|
- 5-10x better compression means 30 days of Prometheus data could become 180+ days
|
||||||
|
- Lightweight, single binary
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- No automatic downsampling (relies on compression alone)
|
||||||
|
- Alerting requires switching to vmalert instead of Prometheus alertmanager integration
|
||||||
|
- Would need to migrate existing data or start fresh
|
||||||
|
|
||||||
|
**Migration Steps:**
|
||||||
|
1. Replace `services.prometheus` with `services.victoriametrics`
|
||||||
|
2. Move scrape configs to `prometheusConfig`
|
||||||
|
3. Set up `services.vmalert` for alerting rules
|
||||||
|
4. Update Grafana datasource to VictoriaMetrics port (8428)
|
||||||
|
5. Keep Alertmanager for notification routing
|
||||||
|
|
||||||
|
### Option 2: Thanos
|
||||||
|
|
||||||
|
Thanos extends Prometheus with long-term storage and automatic downsampling by uploading data to object storage.
|
||||||
|
|
||||||
|
**NixOS Options Available:**
|
||||||
|
- `services.thanos.sidecar` - uploads Prometheus blocks to object storage
|
||||||
|
- `services.thanos.compact` - compacts and downsamples data
|
||||||
|
- `services.thanos.query` - unified query gateway
|
||||||
|
- `services.thanos.query-frontend` - query caching and parallelization
|
||||||
|
- `services.thanos.downsample` - dedicated downsampling service
|
||||||
|
|
||||||
|
**Downsampling Behavior:**
|
||||||
|
- Raw resolution kept for configurable period (default: indefinite)
|
||||||
|
- 5-minute resolution created after 40 hours
|
||||||
|
- 1-hour resolution created after 10 days
|
||||||
|
|
||||||
|
**Retention Configuration (in compactor):**
|
||||||
|
```nix
|
||||||
|
services.thanos.compact = {
|
||||||
|
retention.resolution-raw = "30d"; # Keep raw for 30 days
|
||||||
|
retention.resolution-5m = "180d"; # Keep 5m samples for 6 months
|
||||||
|
retention.resolution-1h = "2y"; # Keep 1h samples for 2 years
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- True downsampling - older data uses progressively less storage
|
||||||
|
- Keep metrics for years with minimal storage impact
|
||||||
|
- Prometheus continues running unchanged
|
||||||
|
- Existing Alertmanager integration preserved
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Requires object storage (MinIO, S3, or local filesystem)
|
||||||
|
- Multiple services to manage (sidecar, compactor, query)
|
||||||
|
- More complex architecture
|
||||||
|
- Additional infrastructure (MinIO) may be needed
|
||||||
|
|
||||||
|
**Required Components:**
|
||||||
|
1. Thanos Sidecar (runs alongside Prometheus)
|
||||||
|
2. Object storage (MinIO or local filesystem)
|
||||||
|
3. Thanos Compactor (handles downsampling)
|
||||||
|
4. Thanos Query (provides unified query endpoint)
|
||||||
|
|
||||||
|
**Migration Steps:**
|
||||||
|
1. Deploy object storage (MinIO or configure filesystem backend)
|
||||||
|
2. Add Thanos sidecar pointing to Prometheus data directory
|
||||||
|
3. Add Thanos compactor with retention policies
|
||||||
|
4. Add Thanos query gateway
|
||||||
|
5. Update Grafana datasource to Thanos Query port (10902)
|
||||||
|
|
||||||
|
## Comparison
|
||||||
|
|
||||||
|
| Aspect | VictoriaMetrics | Thanos |
|
||||||
|
|--------|-----------------|--------|
|
||||||
|
| Complexity | Low (1 service) | Higher (3-4 services) |
|
||||||
|
| Downsampling | No | Yes (automatic) |
|
||||||
|
| Storage savings | 5-10x compression | Compression + downsampling |
|
||||||
|
| Object storage required | No | Yes |
|
||||||
|
| Migration effort | Minimal | Moderate |
|
||||||
|
| Grafana changes | Change port only | Change port only |
|
||||||
|
| Alerting changes | Need vmalert | Keep existing |
|
||||||
|
|
||||||
|
## Recommendation
|
||||||
|
|
||||||
|
**Start with VictoriaMetrics** for simplicity. The compression alone may provide 6+ months of retention in the same disk space currently used for 30 days.
|
||||||
|
|
||||||
|
If multi-year retention with true downsampling becomes necessary, Thanos can be evaluated later. However, it requires deploying object storage infrastructure (MinIO) which adds operational complexity.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- VictoriaMetrics docs: https://docs.victoriametrics.com/
|
||||||
|
- Thanos docs: https://thanos.io/tip/thanos/getting-started.md/
|
||||||
|
- NixOS options searched from nixpkgs revision e576e3c9 (NixOS 25.11)
|
||||||
244
docs/plans/media-pc-replacement.md
Normal file
244
docs/plans/media-pc-replacement.md
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
# Media PC Replacement
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Replace the aging Linux+Kodi media PC connected to the TV with a modern, compact solution. Primary use cases are Jellyfin/Kodi playback and watching Twitch/YouTube. The current machine (`media`, 10.69.31.50) is on VLAN 31.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### Hardware
|
||||||
|
- **CPU**: Intel Core i7-4770K @ 3.50GHz (Haswell, 4C/8T, 2013)
|
||||||
|
- **GPU**: Nvidia GeForce GT 710 (Kepler, GK208B)
|
||||||
|
- **OS**: Ubuntu 22.04.5 LTS (Jammy)
|
||||||
|
- **Software**: Kodi
|
||||||
|
- **Network**: `media.home.2rjus.net` at `10.69.31.50` (VLAN 31)
|
||||||
|
|
||||||
|
### Control & Display
|
||||||
|
- **Input**: Wireless keyboard (works well, useful for browser)
|
||||||
|
- **TV**: 1080p (no 4K/HDR currently, but may upgrade TV later)
|
||||||
|
- **Audio**: Surround system connected via HDMI ARC from TV (PC → HDMI → TV → ARC → surround)
|
||||||
|
|
||||||
|
### Notes on Current Hardware
|
||||||
|
- The i7-4770K is massively overpowered for media playback — it's a full desktop CPU from 2013
|
||||||
|
- The GT 710 is a low-end passive GPU; supports NVDEC for H.264/H.265 hardware decode but limited to 4K@30Hz over HDMI 1.4
|
||||||
|
- Ubuntu 22.04 is approaching EOL (April 2027) and is not managed by this repo
|
||||||
|
- The whole system is likely in a full-size or mid-tower case — not ideal for a TV setup
|
||||||
|
|
||||||
|
### Integration
|
||||||
|
- **Media source**: Jellyfin on `jelly01` (10.69.13.14) serves media from NAS via NFS
|
||||||
|
- **DNS**: A record in `services/ns/external-hosts.nix`
|
||||||
|
- **Not managed**: Not a NixOS host in this repo, no monitoring/auto-updates
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
### Option 1: Dedicated Streaming Device (Apple TV / Nvidia Shield)
|
||||||
|
|
||||||
|
| Aspect | Apple TV 4K | Nvidia Shield Pro |
|
||||||
|
|--------|-------------|-------------------|
|
||||||
|
| **Price** | ~$130-180 | ~$200 |
|
||||||
|
| **Jellyfin** | Swiftfin app (good) | Jellyfin Android TV (good) |
|
||||||
|
| **Kodi** | Not available (tvOS) | Full Kodi support |
|
||||||
|
| **Twitch** | Native app | Native app |
|
||||||
|
| **YouTube** | Native app | Native app |
|
||||||
|
| **HDR/DV** | Dolby Vision + HDR10 | Dolby Vision + HDR10 |
|
||||||
|
| **4K** | Yes | Yes |
|
||||||
|
| **Form factor** | Tiny, silent | Small, silent |
|
||||||
|
| **Remote** | Excellent Siri remote | Decent, supports CEC |
|
||||||
|
| **Homelab integration** | None | Minimal (Plex/Kodi only) |
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Zero maintenance - appliance experience
|
||||||
|
- Excellent app ecosystem (native Twitch, YouTube, streaming services)
|
||||||
|
- Silent, tiny form factor
|
||||||
|
- Great remote control / CEC support
|
||||||
|
- Hardware-accelerated codec support out of the box
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- No NixOS management, monitoring, or auto-updates
|
||||||
|
- Can't run arbitrary software
|
||||||
|
- Jellyfin clients are decent but not as mature as Kodi
|
||||||
|
- Vendor lock-in (Apple ecosystem / Google ecosystem)
|
||||||
|
- No SSH access for troubleshooting
|
||||||
|
|
||||||
|
### Option 2: NixOS Mini PC (Kodi Appliance)
|
||||||
|
|
||||||
|
A small form factor PC (Intel NUC, Beelink, MinisForum, etc.) running NixOS with Kodi as the desktop environment.
|
||||||
|
|
||||||
|
**NixOS has built-in support:**
|
||||||
|
- `services.xserver.desktopManager.kodi.enable` - boots directly into Kodi
|
||||||
|
- `kodi-gbm` package - Kodi with direct DRM/KMS rendering (no X11/Wayland needed)
|
||||||
|
- `kodiPackages.jellycon` - Jellyfin integration for Kodi
|
||||||
|
- `kodiPackages.sendtokodi` - plays streams via yt-dlp (Twitch, YouTube)
|
||||||
|
- `kodiPackages.inputstream-adaptive` - adaptive streaming support
|
||||||
|
|
||||||
|
**Example NixOS config sketch:**
|
||||||
|
```nix
|
||||||
|
{ pkgs, ... }:
|
||||||
|
{
|
||||||
|
services.xserver.desktopManager.kodi = {
|
||||||
|
enable = true;
|
||||||
|
package = pkgs.kodi.withPackages (p: [
|
||||||
|
p.jellycon
|
||||||
|
p.sendtokodi
|
||||||
|
p.inputstream-adaptive
|
||||||
|
]);
|
||||||
|
};
|
||||||
|
|
||||||
|
# Auto-login to Kodi session
|
||||||
|
services.displayManager.autoLogin = {
|
||||||
|
enable = true;
|
||||||
|
user = "kodi";
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Full NixOS management (monitoring, auto-updates, vault, promtail)
|
||||||
|
- Kodi is a proven TV interface with excellent remote/CEC support
|
||||||
|
- JellyCon integrates Jellyfin library directly into Kodi
|
||||||
|
- Twitch/YouTube via sendtokodi + yt-dlp or Kodi browser addons
|
||||||
|
- Can run arbitrary services (e.g., Home Assistant dashboard)
|
||||||
|
- Declarative, reproducible config in this repo
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- More maintenance than an appliance
|
||||||
|
- NixOS + Kodi on bare metal needs GPU driver setup (Intel iGPU is usually fine)
|
||||||
|
- Kodi YouTube/Twitch addons are less polished than native apps
|
||||||
|
- Need to buy hardware (~$150-400 for a decent mini PC)
|
||||||
|
- Power consumption higher than a streaming device
|
||||||
|
|
||||||
|
### Option 3: NixOS Mini PC (Wayland Desktop)
|
||||||
|
|
||||||
|
A mini PC running NixOS with a lightweight Wayland compositor, launching Kodi for media and a browser for Twitch/YouTube.
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Best of both worlds: Kodi for media, Firefox/Chromium for Twitch/YouTube
|
||||||
|
- Full NixOS management
|
||||||
|
- Can switch between Kodi and browser easily
|
||||||
|
- Native web experience for streaming sites
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- More complex setup (compositor + Kodi + browser)
|
||||||
|
- Harder to get a good "10-foot UI" experience
|
||||||
|
- Keyboard/mouse may be needed alongside remote
|
||||||
|
- Significantly more maintenance
|
||||||
|
|
||||||
|
## Comparison
|
||||||
|
|
||||||
|
| Criteria | Dedicated Device | NixOS Kodi | NixOS Desktop |
|
||||||
|
|----------|-----------------|------------|---------------|
|
||||||
|
| **Maintenance** | None | Low | Medium |
|
||||||
|
| **Media experience** | Excellent | Excellent | Good |
|
||||||
|
| **Twitch/YouTube** | Excellent (native apps) | Good (addons/yt-dlp) | Excellent (browser) |
|
||||||
|
| **Homelab integration** | None | Full | Full |
|
||||||
|
| **Form factor** | Tiny | Small | Small |
|
||||||
|
| **Cost** | $130-200 | $150-400 | $150-400 |
|
||||||
|
| **Silent operation** | Yes | Likely (fanless options) | Likely |
|
||||||
|
| **CEC remote** | Yes | Yes (Kodi) | Partial |
|
||||||
|
|
||||||
|
## Decision: NixOS Mini PC with Kodi (Option 2)
|
||||||
|
|
||||||
|
**Rationale:**
|
||||||
|
- Already comfortable with Kodi + wireless keyboard workflow
|
||||||
|
- Browser access for Twitch/YouTube is important — Kodi can launch a browser when needed
|
||||||
|
- Homelab integration comes for free (monitoring, auto-updates, vault)
|
||||||
|
- Natural fit alongside the other 16 NixOS hosts in this repo
|
||||||
|
- Dedicated devices lose the browser/keyboard workflow
|
||||||
|
|
||||||
|
### Display Server: Sway/Hyprland
|
||||||
|
|
||||||
|
Options evaluated:
|
||||||
|
|
||||||
|
| Approach | Pros | Cons |
|
||||||
|
|----------|------|------|
|
||||||
|
| Cage (kiosk) | Simplest, single-app | No browser without TTY switching |
|
||||||
|
| kodi-gbm (no compositor) | Best HDR support | No browser at all, ALSA-only audio |
|
||||||
|
| **Sway/Hyprland** | **Workspace switching, VA-API in browser** | **Slightly more config** |
|
||||||
|
| Full DE (GNOME/KDE) | Everything works | Overkill, heavy |
|
||||||
|
|
||||||
|
**Decision: Sway or Hyprland** (Hyprland preferred — same as desktop)
|
||||||
|
|
||||||
|
- Kodi fullscreen on workspace 1, Firefox on workspace 2
|
||||||
|
- Switch via keybinding on wireless keyboard
|
||||||
|
- Auto-start both on login via greetd
|
||||||
|
- Minimal config — no bar, no decorations, just workspaces
|
||||||
|
- VA-API hardware decode works in Firefox on Wayland (important for YouTube/Twitch)
|
||||||
|
- Can revisit kodi-gbm later if HDR becomes a priority (just a config change)
|
||||||
|
|
||||||
|
### Twitch/YouTube
|
||||||
|
|
||||||
|
Firefox on workspace 2, switched to via keyboard. Kodi addons (sendtokodi, YouTube plugin) available as secondary options but a real browser is the primary approach.
|
||||||
|
|
||||||
|
### Media Playback: Kodi + JellyCon + NFS Direct Path
|
||||||
|
|
||||||
|
Three options were evaluated for media playback:
|
||||||
|
|
||||||
|
| Approach | Transcoding | Library management | Watch state sync |
|
||||||
|
|----------|-------------|-------------------|-----------------|
|
||||||
|
| Jellyfin only (browser) | Yes — browsers lack codec support for DTS, PGS subs, etc. | Jellyfin | Jellyfin |
|
||||||
|
| Kodi + NFS only | No — Kodi plays everything natively | Kodi local DB | None |
|
||||||
|
| **Kodi + JellyCon + NFS** | **No — Kodi's native player, direct path via NFS** | **Jellyfin** | **Jellyfin** |
|
||||||
|
|
||||||
|
**Decision: Kodi + JellyCon with NFS direct path**
|
||||||
|
|
||||||
|
- JellyCon presents the Jellyfin library inside Kodi's UI (browse, search, metadata, artwork)
|
||||||
|
- Playback uses Kodi's native player — direct play, no transcoding, full codec support including surround passthrough
|
||||||
|
- JellyCon's "direct path" mode maps Jellyfin paths to local NFS mounts, so playback goes straight over NFS without streaming through Jellyfin's HTTP layer
|
||||||
|
- Watch state, resume position, etc. sync back to Jellyfin — accessible from other devices too
|
||||||
|
- NFS mount follows the same pattern as jelly01 (`nas.home.2rjus.net:/mnt/hdd-pool/media`)
|
||||||
|
|
||||||
|
### Audio Passthrough
|
||||||
|
|
||||||
|
Kodi on NixOS supports HDMI audio passthrough for surround formats (AC3, DTS, etc.). The ARC chain (PC → HDMI → TV → ARC → surround) works transparently — Kodi just needs to be configured for passthrough rather than decoding audio locally.
|
||||||
|
|
||||||
|
## Hardware
|
||||||
|
|
||||||
|
### Leading Candidate: GMKtec G3
|
||||||
|
|
||||||
|
- **CPU**: Intel N100 (Alder Lake-N, 4C/4T)
|
||||||
|
- **RAM**: 16GB
|
||||||
|
- **Storage**: 512GB NVMe
|
||||||
|
- **Price**: ~NOK 2800 (~$250 USD)
|
||||||
|
- **Source**: AliExpress
|
||||||
|
|
||||||
|
The N100 supports hardware decode for all relevant 4K codecs:
|
||||||
|
|
||||||
|
| Codec | Support | Used by |
|
||||||
|
|-------|---------|---------|
|
||||||
|
| H.264/AVC | Yes (Quick Sync) | Older media |
|
||||||
|
| H.265/HEVC 10-bit | Yes (Quick Sync) | Most 4K media, HDR |
|
||||||
|
| VP9 | Yes (Quick Sync) | YouTube 4K |
|
||||||
|
| AV1 | Yes (Quick Sync) | YouTube, Twitch, newer encodes |
|
||||||
|
|
||||||
|
16GB RAM is comfortable for Kodi + browser + NixOS system services (node-exporter, promtail, etc.) with plenty of headroom.
|
||||||
|
|
||||||
|
### Key Requirements
|
||||||
|
- HDMI 2.0+ for 4K future-proofing (current TV is 1080p)
|
||||||
|
- Hardware video decode via VA-API / Intel Quick Sync
|
||||||
|
- HDR support (for future TV upgrade)
|
||||||
|
- Fanless or near-silent operation
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
1. **Choose and order hardware**
|
||||||
|
2. **Create host configuration** (`hosts/media1/`)
|
||||||
|
- Kodi desktop manager with Jellyfin + streaming addons
|
||||||
|
- Intel/AMD iGPU driver and VA-API hardware decode
|
||||||
|
- HDMI audio passthrough for surround
|
||||||
|
- NFS mount for media (same pattern as jelly01)
|
||||||
|
- Browser package (Firefox/Chromium) for Twitch/YouTube fallback
|
||||||
|
- Standard system modules (monitoring, promtail, vault, auto-upgrade)
|
||||||
|
3. **Install NixOS** on the mini PC
|
||||||
|
4. **Configure Kodi** (Jellyfin server, addons, audio passthrough)
|
||||||
|
5. **Update DNS** - point `media.home.2rjus.net` to new IP (or keep on VLAN 31)
|
||||||
|
6. **Retire old media PC**
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [x] What are the current media PC specs? — i7-4770K, GT 710, Ubuntu 22.04. Overkill CPU, weak GPU, large form factor. Not worth reusing if goal is compact/silent.
|
||||||
|
- [x] VLAN? — Keep on VLAN 31 for now, same as current media PC. Can revisit later.
|
||||||
|
- [x] Is CEC needed? — No, not using it currently. Can add later if desired.
|
||||||
|
- [x] Is 4K HDR output needed? — TV is 1080p now, but want 4K/HDR capability for future TV upgrade
|
||||||
|
- [x] Audio setup? — Surround system via HDMI ARC from TV. Media PC outputs HDMI to TV, TV passes audio to surround via ARC. Kodi/any player just needs HDMI audio output with surround passthrough.
|
||||||
|
- [x] Are there streaming service apps needed? — No. Only Twitch/YouTube, which work fine in any browser.
|
||||||
|
- [x] Budget? — ~NOK 2800 for GMKtec G3 (N100, 16GB, 512GB NVMe)
|
||||||
116
docs/plans/memory-issues-follow-up.md
Normal file
116
docs/plans/memory-issues-follow-up.md
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# Memory Issues Follow-up
|
||||||
|
|
||||||
|
Tracking the zram change to verify it resolves OOM issues during nixos-upgrade on low-memory hosts.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
On 2026-02-08, ns2 (2GB RAM) experienced an OOM kill during nixos-upgrade. The Nix evaluation process consumed ~1.6GB before being killed by the kernel. ns1 (manually increased to 4GB) succeeded with the same upgrade.
|
||||||
|
|
||||||
|
Root cause: 2GB RAM is insufficient for Nix flake evaluation without swap.
|
||||||
|
|
||||||
|
## Fix Applied
|
||||||
|
|
||||||
|
**Commit:** `1674b6a` - system: enable zram swap for all hosts
|
||||||
|
|
||||||
|
**Merged:** 2026-02-08 ~12:15 UTC
|
||||||
|
|
||||||
|
**Change:** Added `zramSwap.enable = true` to `system/zram.nix`, providing ~2GB compressed swap on all hosts.
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
|
||||||
|
| Time (UTC) | Event |
|
||||||
|
|------------|-------|
|
||||||
|
| 05:00:46 | ns2 nixos-upgrade OOM killed |
|
||||||
|
| 05:01:47 | `nixos_upgrade_failed` alert fired |
|
||||||
|
| 12:15 | zram commit merged to master |
|
||||||
|
| 12:19 | ns2 rebooted with zram enabled |
|
||||||
|
| 12:20 | ns1 rebooted (memory reduced to 2GB via tofu) |
|
||||||
|
|
||||||
|
## Hosts Affected
|
||||||
|
|
||||||
|
All 2GB VMs that run nixos-upgrade:
|
||||||
|
- ns1, ns2 (DNS)
|
||||||
|
- vault01
|
||||||
|
- testvm01, testvm02, testvm03
|
||||||
|
- kanidm01
|
||||||
|
|
||||||
|
## Metrics to Monitor
|
||||||
|
|
||||||
|
Check these in Grafana or via PromQL to verify the fix:
|
||||||
|
|
||||||
|
### Swap availability (should be ~2GB after upgrade)
|
||||||
|
```promql
|
||||||
|
node_memory_SwapTotal_bytes / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Swap usage during upgrades
|
||||||
|
```promql
|
||||||
|
(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes) / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Zswap compressed bytes (note: zswap, not zram)

`node_memory_Zswap_bytes` tracks the kernel's zswap cache, which is a separate feature from zram — on hosts using only `zramSwap` this will typically read 0. Zram usage is already visible in the swap metrics above, so a zero here does not indicate a problem.
|
||||||
|
```promql
|
||||||
|
node_memory_Zswap_bytes / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Upgrade failures (should be 0)
|
||||||
|
```promql
|
||||||
|
node_systemd_unit_state{name="nixos-upgrade.service", state="failed"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Memory available during upgrades
|
||||||
|
```promql
|
||||||
|
node_memory_MemAvailable_bytes / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verification Steps
|
||||||
|
|
||||||
|
After a few days (allow auto-upgrades to run on all hosts):
|
||||||
|
|
||||||
|
1. Check all hosts have swap enabled:
|
||||||
|
```promql
|
||||||
|
node_memory_SwapTotal_bytes > 0
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check for any upgrade failures since the fix:
|
||||||
|
```promql
|
||||||
|
count_over_time(ALERTS{alertname="nixos_upgrade_failed"}[7d])
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Review if any hosts used swap during upgrades (check historical graphs)
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
- No `nixos_upgrade_failed` alerts due to OOM after 2026-02-08
|
||||||
|
- All hosts show ~2GB swap available
|
||||||
|
- Upgrades complete successfully on 2GB VMs
|
||||||
|
|
||||||
|
## Fallback Options
|
||||||
|
|
||||||
|
If zram is insufficient:
|
||||||
|
|
||||||
|
1. **Increase VM memory** - Update `terraform/vms.tf` to 4GB for affected hosts
|
||||||
|
2. **Enable memory ballooning** - Configure VMs with dynamic memory allocation (see below)
|
||||||
|
3. **Use remote builds** - Configure `nix.buildMachines` to offload evaluation
|
||||||
|
4. **Reduce flake size** - Split configurations to reduce evaluation memory
|
||||||
|
|
||||||
|
### Memory Ballooning
|
||||||
|
|
||||||
|
Proxmox supports memory ballooning, which allows VMs to dynamically grow/shrink memory allocation based on demand. The balloon driver inside the guest communicates with the hypervisor to release or reclaim memory pages.
|
||||||
|
|
||||||
|
Configuration in `terraform/vms.tf`:
|
||||||
|
```hcl
|
||||||
|
memory = 4096 # maximum memory
|
||||||
|
balloon = 2048 # minimum memory (shrinks to this when idle)
|
||||||
|
```
|
||||||
|
|
||||||
|
Pros:
|
||||||
|
- VMs get memory on-demand without reboots
|
||||||
|
- Better host memory utilization
|
||||||
|
- Solves upgrade OOM without permanently allocating 4GB
|
||||||
|
|
||||||
|
Cons:
|
||||||
|
- Requires QEMU guest agent running in guest
|
||||||
|
- Guest can experience memory pressure if host is overcommitted
|
||||||
|
|
||||||
|
Ballooning and zram are complementary - ballooning provides headroom from the host, zram provides overflow within the guest.
|
||||||
145
docs/plans/new-services.md
Normal file
145
docs/plans/new-services.md
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
# New Service Candidates
|
||||||
|
|
||||||
|
Ideas for additional services to deploy in the homelab. These lean more enterprise/obscure
|
||||||
|
than the typical self-hosted fare.
|
||||||
|
|
||||||
|
## Litestream
|
||||||
|
|
||||||
|
Continuous SQLite replication to S3-compatible storage. Streams WAL changes in near-real-time,
|
||||||
|
providing point-in-time recovery without scheduled backup jobs.
|
||||||
|
|
||||||
|
**Why:** Several services use SQLite (Home Assistant, potentially others). Litestream would
|
||||||
|
give continuous backup to Garage S3 with minimal resource overhead and near-zero configuration.
|
||||||
|
Replaces cron-based backup scripts with a small daemon per database.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Garage S3 as replication target (already deployed)
|
||||||
|
- Home Assistant SQLite database is the primary candidate
|
||||||
|
- Could also cover any future SQLite-backed services
|
||||||
|
|
||||||
|
**Complexity:** Low. Single Go binary, minimal config (source DB path + S3 endpoint).
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `litestream`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ntopng
|
||||||
|
|
||||||
|
Deep network traffic analysis and flow monitoring. Provides real-time visibility into bandwidth
|
||||||
|
usage, protocol distribution, top talkers, and anomaly detection via a web UI.
|
||||||
|
|
||||||
|
**Why:** We have host-level metrics (node-exporter) and logs (Loki) but no network-level
|
||||||
|
visibility. ntopng would show traffic patterns across the infrastructure — NFS throughput to
|
||||||
|
the NAS, DNS query volume, inter-host traffic, and bandwidth anomalies. Useful for capacity
|
||||||
|
planning and debugging network issues.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Could export metrics to Prometheus via its built-in exporter
|
||||||
|
- Web UI behind http-proxy with Kanidm OIDC (if supported) or Pomerium
|
||||||
|
- NetFlow/sFlow from managed switches (if available)
|
||||||
|
- Passive traffic capture on a mirror port or the monitoring host itself
|
||||||
|
|
||||||
|
**Complexity:** Medium. Needs network tap or mirror port for full visibility, or can run
|
||||||
|
in host-local mode. May need a dedicated interface or VLAN mirror.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `ntopng`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Renovate
|
||||||
|
|
||||||
|
Automated dependency update bot that understands Nix flakes natively. Creates branches/PRs
|
||||||
|
to bump flake inputs on a configurable schedule.
|
||||||
|
|
||||||
|
**Why:** Currently `nix flake update` is manual. Renovate can automatically propose updates
|
||||||
|
to individual flake inputs (nixpkgs, homelab-deploy, nixos-exporter, etc.), group related
|
||||||
|
updates, and respect schedules. More granular than updating everything at once — can bump
|
||||||
|
nixpkgs weekly but hold back other inputs, auto-merge patch-level changes, etc.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Runs against git.t-juice.club repositories
|
||||||
|
- Understands `flake.lock` format natively
|
||||||
|
- Could target both `nixos-servers` and `nixos` repos
|
||||||
|
- Update branches would be validated by homelab-deploy builder
|
||||||
|
|
||||||
|
**Complexity:** Medium. Needs git forge integration (Gitea/Forgejo API). Self-hosted runner
|
||||||
|
mode available. Configuration via `renovate.json` in each repo.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `renovate`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Pomerium
|
||||||
|
|
||||||
|
Identity-aware reverse proxy implementing zero-trust access. Every request is authenticated
|
||||||
|
and authorized based on identity, device, and context — not just network location.
|
||||||
|
|
||||||
|
**Why:** Currently Caddy terminates TLS but doesn't enforce authentication on most services.
|
||||||
|
Pomerium would put Kanidm OIDC authentication in front of every internal service, with
|
||||||
|
per-route authorization policies (e.g., "only admins can access Prometheus," "require re-auth
|
||||||
|
for Vault UI"). Directly addresses the security hardening plan's goals.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Kanidm as OIDC identity provider (already deployed)
|
||||||
|
- Could replace or sit in front of Caddy for internal services
|
||||||
|
- Per-route policies based on Kanidm groups (admins, users, ssh-users)
|
||||||
|
- Centralizes access logging and audit trail
|
||||||
|
|
||||||
|
**Complexity:** Medium-high. Needs careful integration with existing Caddy reverse proxy.
|
||||||
|
Decision needed on whether Pomerium replaces Caddy or works alongside it (Pomerium for
|
||||||
|
auth, Caddy for TLS termination and routing, or Pomerium handles everything).
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `pomerium`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Apache Guacamole
|
||||||
|
|
||||||
|
Clientless remote desktop and SSH gateway. Provides browser-based access to hosts via
|
||||||
|
RDP, VNC, SSH, and Telnet with no client software required. Supports session recording
|
||||||
|
and playback.
|
||||||
|
|
||||||
|
**Why:** Provides an alternative remote access path that doesn't require VPN software or
|
||||||
|
SSH keys on the client device. Useful for accessing hosts from untrusted machines (phone,
|
||||||
|
borrowed laptop) or providing temporary access to others. Session recording gives an audit
|
||||||
|
trail. Could complement the WireGuard remote access plan rather than replace it.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Kanidm for authentication (OIDC or LDAP)
|
||||||
|
- Behind http-proxy or Pomerium for TLS
|
||||||
|
- SSH access to all hosts in the fleet
|
||||||
|
- Session recordings could be stored on Garage S3
|
||||||
|
- Could serve as the "emergency access" path when VPN is unavailable
|
||||||
|
|
||||||
|
**Complexity:** Medium. Java-based (guacd + web app), typically needs PostgreSQL for
|
||||||
|
connection/user storage (already available). Docker is the common deployment method but
|
||||||
|
native packaging exists.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `guacamole-server` and `guacamole-client`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CrowdSec
|
||||||
|
|
||||||
|
Collaborative intrusion prevention system with crowd-sourced threat intelligence.
|
||||||
|
Parses logs to detect attack patterns, applies remediation (firewall bans, CAPTCHA),
|
||||||
|
and shares/receives threat signals from a global community network.
|
||||||
|
|
||||||
|
**Why:** Goes beyond fail2ban with behavioral detection, crowd-sourced IP reputation,
|
||||||
|
and a scenario-based engine. Fits the security hardening plan. The community blocklist
|
||||||
|
means we benefit from threat intelligence gathered across thousands of deployments.
|
||||||
|
Could parse SSH logs, HTTP access logs, and other service logs to detect and block
|
||||||
|
malicious activity.
|
||||||
|
|
||||||
|
**Integration points:**
|
||||||
|
- Could consume logs from Loki or directly from journald/log files
|
||||||
|
- Firewall bouncer for iptables/nftables remediation
|
||||||
|
- Caddy bouncer for HTTP-level blocking
|
||||||
|
- Prometheus metrics exporter for alert integration
|
||||||
|
- Scenarios available for SSH brute force, HTTP scanning, and more
|
||||||
|
- Feeds into existing alerting pipeline (Alertmanager -> alerttonotify)
|
||||||
|
|
||||||
|
**Complexity:** Medium. Agent (log parser + decision engine) on each host or centralized.
|
||||||
|
Bouncers (enforcement) on edge hosts. Free community tier includes threat intel access.
|
||||||
|
|
||||||
|
**NixOS packaging:** Available in nixpkgs as `crowdsec`.
|
||||||
232
docs/plans/nixos-hypervisor.md
Normal file
232
docs/plans/nixos-hypervisor.md
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
# NixOS Hypervisor
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Experiment with running a NixOS-based hypervisor as an alternative/complement to the current Proxmox setup. Goal is better homelab integration — declarative config, monitoring, auto-updates — while retaining the ability to run VMs with a Terraform-like workflow.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
- Proxmox works but doesn't integrate with the NixOS-managed homelab (no monitoring, no auto-updates, no vault, no declarative config)
|
||||||
|
- The PN51 units (once stable) are good candidates for experimentation — test-tier, plenty of RAM (32-64GB), 8C/16T
|
||||||
|
- Long-term: could reduce reliance on Proxmox or provide a secondary hypervisor pool
|
||||||
|
- **VM migration**: Currently all VMs (including both nameservers) run on a single Proxmox host. Being able to migrate VMs between hypervisors would allow rebooting a host for kernel updates without downtime for critical services like DNS.
|
||||||
|
|
||||||
|
## Hardware Candidates
|
||||||
|
|
||||||
|
| | pn01 | pn02 |
|
||||||
|
|---|---|---|
|
||||||
|
| **CPU** | Ryzen 7 5700U (8C/16T) | Ryzen 7 5700U (8C/16T) |
|
||||||
|
| **RAM** | 64GB (2x32GB) | 32GB (1x32GB, second slot available) |
|
||||||
|
| **Storage** | 1TB NVMe | 1TB SATA SSD (NVMe planned) |
|
||||||
|
| **Status** | Stability testing | Stability testing |
|
||||||
|
|
||||||
|
## Options
|
||||||
|
|
||||||
|
### Option 1: Incus
|
||||||
|
|
||||||
|
Fork of LXD (created after Canonical moved LXD development in-house and changed its licensing and contribution terms). Supports both containers (LXC) and VMs (QEMU/KVM).
|
||||||
|
|
||||||
|
**NixOS integration:**
|
||||||
|
- `virtualisation.incus.enable` module in nixpkgs
|
||||||
|
- Manages storage pools, networks, and instances
|
||||||
|
- REST API for automation
|
||||||
|
- CLI tool (`incus`) for management
|
||||||
|
|
||||||
|
**Terraform integration:**
|
||||||
|
- `lxd` provider works with Incus (API-compatible)
|
||||||
|
- Dedicated `incus` Terraform provider also exists
|
||||||
|
- Can define VMs/containers in OpenTofu, similar to current Proxmox workflow
|
||||||
|
|
||||||
|
**Migration:**
|
||||||
|
- Built-in live and offline migration via `incus move <instance> --target <host>`
|
||||||
|
- Clustering makes hosts aware of each other — migration is a first-class operation
|
||||||
|
- Shared storage (NFS, Ceph) or Incus can transfer storage during migration
|
||||||
|
- Stateful stop-and-move also supported for offline migration
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Supports both containers and VMs
|
||||||
|
- REST API + CLI for automation
|
||||||
|
- Built-in clustering and migration — closest to Proxmox experience
|
||||||
|
- Good NixOS module support
|
||||||
|
- Image-based workflow (can build NixOS images and import)
|
||||||
|
- Active development and community
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Another abstraction layer on top of QEMU/KVM
|
||||||
|
- Less mature Terraform provider than libvirt
|
||||||
|
- Container networking can be complex
|
||||||
|
- NixOS guests in Incus VMs need some setup
|
||||||
|
|
||||||
|
### Option 2: libvirt/QEMU
|
||||||
|
|
||||||
|
Standard Linux virtualization stack. Thin wrapper around QEMU/KVM.
|
||||||
|
|
||||||
|
**NixOS integration:**
|
||||||
|
- `virtualisation.libvirtd.enable` module in nixpkgs
|
||||||
|
- Mature and well-tested
|
||||||
|
- virsh CLI for management
|
||||||
|
|
||||||
|
**Terraform integration:**
|
||||||
|
- `dmacvicar/libvirt` provider — mature, well-maintained
|
||||||
|
- Supports cloud-init, volume management, network config
|
||||||
|
- Very similar workflow to current Proxmox+OpenTofu setup
|
||||||
|
- Can reuse cloud-init patterns from existing `terraform/` config
|
||||||
|
|
||||||
|
**Migration:**
|
||||||
|
- Supports live and offline migration via `virsh migrate`
|
||||||
|
- Requires shared storage (NFS, Ceph, or similar) for live migration
|
||||||
|
- Requires matching CPU models between hosts (or configuring a common baseline CPU model in the domain XML)
|
||||||
|
- Works but is manual — no cluster awareness, must specify target URI
|
||||||
|
- No built-in orchestration for multi-host scenarios
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Closest to current Proxmox+Terraform workflow
|
||||||
|
- Most mature Terraform provider
|
||||||
|
- Minimal abstraction — direct QEMU/KVM management
|
||||||
|
- Well-understood, massive community
|
||||||
|
- Cloud-init works identically to Proxmox workflow
|
||||||
|
- Can reuse existing template-building patterns
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- VMs only (no containers without adding LXC separately)
|
||||||
|
- No built-in REST API (would need to expose libvirt socket)
|
||||||
|
- No web UI without adding cockpit or virt-manager
|
||||||
|
- Migration works but requires manual setup — no clustering, no orchestration
|
||||||
|
- Less feature-rich than Incus for multi-host scenarios
|
||||||
|
|
||||||
|
### Option 3: microvm.nix
|
||||||
|
|
||||||
|
NixOS-native microVM framework. VMs defined as NixOS modules in the host's flake.
|
||||||
|
|
||||||
|
**NixOS integration:**
|
||||||
|
- VMs are NixOS configurations in the same flake
|
||||||
|
- Supports multiple backends: cloud-hypervisor, QEMU, firecracker, kvmtool
|
||||||
|
- Lightweight — shares host's nix store with guests via virtiofs
|
||||||
|
- Declarative network, storage, and resource allocation
|
||||||
|
|
||||||
|
**Terraform integration:**
|
||||||
|
- None — everything is defined in Nix
|
||||||
|
- Fundamentally different workflow from current Proxmox+Terraform approach
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Most NixOS-native approach
|
||||||
|
- VMs defined right alongside host configs in this repo
|
||||||
|
- Very lightweight — fast boot, minimal overhead
|
||||||
|
- Shares nix store with host (no duplicate packages)
|
||||||
|
- No cloud-init needed — guest config is part of the flake
|
||||||
|
|
||||||
|
**Migration:**
|
||||||
|
- No migration support — VMs are tied to the host's NixOS config
|
||||||
|
- Moving a VM means rebuilding it on another host
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Very niche, smaller community
|
||||||
|
- Different mental model from current workflow
|
||||||
|
- Only NixOS guests (no Ubuntu, FreeBSD, etc.)
|
||||||
|
- No Terraform integration
|
||||||
|
- No migration support
|
||||||
|
- Less isolation than full QEMU VMs
|
||||||
|
- Would need to learn a new deployment pattern
|
||||||
|
|
||||||
|
## Comparison
|
||||||
|
|
||||||
|
| Criteria | Incus | libvirt | microvm.nix |
|
||||||
|
|----------|-------|---------|-------------|
|
||||||
|
| **Workflow similarity** | Medium | High | Low |
|
||||||
|
| **Terraform support** | Yes (lxd/incus provider) | Yes (mature provider) | No |
|
||||||
|
| **NixOS module** | Yes | Yes | Yes |
|
||||||
|
| **Containers + VMs** | Both | VMs only | VMs only |
|
||||||
|
| **Non-NixOS guests** | Yes | Yes | No |
|
||||||
|
| **Live migration** | Built-in (first-class) | Yes (manual setup) | No |
|
||||||
|
| **Offline migration** | Built-in | Yes (manual setup) | No (rebuild) |
|
||||||
|
| **Clustering** | Built-in | Manual | No |
|
||||||
|
| **Learning curve** | Medium | Low | Medium |
|
||||||
|
| **Community/maturity** | Growing | Very mature | Niche |
|
||||||
|
| **Overhead** | Low | Minimal | Minimal |
|
||||||
|
|
||||||
|
## Recommendation
|
||||||
|
|
||||||
|
Start with **Incus**. Migration and clustering are key requirements:
|
||||||
|
- Built-in clustering makes two PN51s a proper hypervisor pool
|
||||||
|
- Live and offline migration are first-class operations, similar to Proxmox
|
||||||
|
- Can move VMs between hosts for maintenance (kernel updates, hardware work) without downtime
|
||||||
|
- Supports both containers and VMs — flexibility for future use
|
||||||
|
- Terraform provider exists (less mature than libvirt's, but functional)
|
||||||
|
- REST API enables automation beyond what Terraform covers
|
||||||
|
|
||||||
|
libvirt could achieve similar results but requires significantly more manual setup for migration and has no clustering awareness. For a two-node setup where migration is a priority, Incus provides much more out of the box.
|
||||||
|
|
||||||
|
**microvm.nix** is off the table given the migration requirement.
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
### Phase 1: Single-Node Setup (on one PN51)
|
||||||
|
|
||||||
|
1. Enable `virtualisation.incus` on pn01 (or whichever is stable)
|
||||||
|
2. Initialize Incus (`incus admin init`) — configure storage pool (local NVMe) and network bridge
|
||||||
|
3. Configure bridge networking for VM traffic on VLAN 12
|
||||||
|
4. Build a NixOS VM image and import it into Incus
|
||||||
|
5. Create a test VM manually with `incus launch` to validate the setup
|
||||||
|
|
||||||
|
### Phase 2: Two-Node Cluster (PN51s only)
|
||||||
|
|
||||||
|
1. Enable Incus on the second PN51
|
||||||
|
2. Form a cluster between both nodes
|
||||||
|
3. Configure shared storage (NFS from NAS, or Ceph if warranted)
|
||||||
|
4. Test offline migration: `incus move <vm> --target <other-node>`
|
||||||
|
5. Test live migration with shared storage
|
||||||
|
6. CPU compatibility is not an issue here — both nodes have identical Ryzen 7 5700U CPUs
|
||||||
|
|
||||||
|
### Phase 3: Terraform Integration
|
||||||
|
|
||||||
|
1. Add Incus Terraform provider to `terraform/`
|
||||||
|
2. Define a test VM in OpenTofu (cloud-init, static IP, vault provisioning)
|
||||||
|
3. Verify the full pipeline: tofu apply -> VM boots -> cloud-init -> vault credentials -> NixOS rebuild
|
||||||
|
4. Compare workflow with existing Proxmox pipeline
|
||||||
|
|
||||||
|
### Phase 4: Evaluate and Expand
|
||||||
|
|
||||||
|
- Is the workflow comparable to Proxmox?
|
||||||
|
- Migration reliability — does live migration work cleanly?
|
||||||
|
- Performance overhead acceptable on Ryzen 5700U?
|
||||||
|
- Worth migrating some test-tier VMs from Proxmox?
|
||||||
|
- Could ns1/ns2 run on separate Incus nodes instead of the single Proxmox host?
|
||||||
|
|
||||||
|
### Phase 5: Proxmox Replacement (optional)
|
||||||
|
|
||||||
|
If Incus works well on the PN51s, consider replacing Proxmox entirely for a three-node cluster.
|
||||||
|
|
||||||
|
**CPU compatibility for mixed cluster:**
|
||||||
|
|
||||||
|
| Node | CPU | Architecture | x86-64-v3 |
|
||||||
|
|------|-----|-------------|-----------|
|
||||||
|
| Proxmox host | AMD Ryzen 9 3900X (12C/24T) | Zen 2 | Yes |
|
||||||
|
| pn01 | AMD Ryzen 7 5700U (8C/16T) | Zen 3 | Yes |
|
||||||
|
| pn02 | AMD Ryzen 7 5700U (8C/16T) | Zen 3 | Yes |
|
||||||
|
|
||||||
|
All three CPUs are AMD and support `x86-64-v3`. The 3900X (Zen 2) is the oldest, so it defines the feature ceiling — but `x86-64-v3` is well within its capabilities. VMs configured with `x86-64-v3` can migrate freely between all three nodes.
|
||||||
|
|
||||||
|
Being all-AMD also avoids the trickier Intel/AMD cross-vendor migration edge cases (different CPUID layouts, virtualization extensions).
|
||||||
|
|
||||||
|
The 3900X (12C/24T) would be the most powerful node, making it the natural home for heavier workloads, with the PN51s (8C/16T each) handling lighter VMs or serving as migration targets during maintenance.
|
||||||
|
|
||||||
|
Steps:
|
||||||
|
1. Install NixOS + Incus on the Proxmox host (or a replacement machine)
|
||||||
|
2. Join it to the existing Incus cluster with `x86-64-v3` CPU baseline
|
||||||
|
3. Migrate VMs from Proxmox to the Incus cluster
|
||||||
|
4. Decommission Proxmox
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- [ ] PN51 units pass stability testing (see `pn51-stability.md`)
|
||||||
|
- [ ] Decide which unit to use first (pn01 preferred — 64GB RAM, NVMe, currently more stable)
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- How to handle VM storage? Local NVMe, NFS from NAS, or Ceph between the two nodes?
|
||||||
|
- Network topology: bridge on VLAN 12, or trunk multiple VLANs to the PN51?
|
||||||
|
- Should VMs be on the same VLAN as the hypervisor host, or separate?
|
||||||
|
- Incus clustering with only two nodes — any quorum issues? Three nodes (with Proxmox replacement) would solve this
|
||||||
|
- How to handle NixOS guest images? Build with nixos-generators, or use Incus image builder?
|
||||||
|
- ~~What CPU does the current Proxmox host have?~~ AMD Ryzen 9 3900X (Zen 2) — `x86-64-v3` confirmed, all-AMD cluster
|
||||||
|
- If replacing Proxmox: migrate VMs first, or fresh start and rebuild?
|
||||||
27
docs/plans/nixos-improvements.md
Normal file
27
docs/plans/nixos-improvements.md
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# NixOS Infrastructure Improvements
|
||||||
|
|
||||||
|
This document contains planned improvements to the NixOS infrastructure that are not directly part of the automated deployment pipeline.
|
||||||
|
|
||||||
|
## Planned
|
||||||
|
|
||||||
|
### Custom NixOS Options for Service and System Configuration
|
||||||
|
|
||||||
|
Currently, most service configurations in `services/` and shared system configurations in `system/` are written as plain NixOS module imports without declaring custom options. This means host-specific customization is done by directly setting upstream NixOS options or by duplicating configuration across hosts.
|
||||||
|
|
||||||
|
The `homelab.dns` module (`modules/homelab/dns.nix`) is the first example of defining custom options under a `homelab.*` namespace. This pattern should be extended to more of the repository's configuration.
|
||||||
|
|
||||||
|
**Goals:**
|
||||||
|
|
||||||
|
- Define `homelab.*` options for services and shared configuration where it makes sense, following the pattern established by `homelab.dns`
|
||||||
|
- Allow hosts to enable/configure services declaratively (e.g. `homelab.monitoring.enable`, `homelab.http-proxy.virtualHosts`) rather than importing opaque module files
|
||||||
|
- Keep options simple and focused — wrap only the parts that vary between hosts or that benefit from a clearer interface. Not everything needs a custom option.
|
||||||
|
|
||||||
|
**Candidate areas:**
|
||||||
|
|
||||||
|
- `system/` modules (e.g. auto-upgrade schedule, ACME CA URL, monitoring endpoints)
|
||||||
|
- `services/` modules where multiple hosts use the same service with different parameters
|
||||||
|
- Cross-cutting concerns that are currently implicit (e.g. which Loki endpoint promtail ships to)
|
||||||
|
|
||||||
|
## Completed
|
||||||
|
|
||||||
|
- [DNS Automation](completed/dns-automation.md) - Automatically generate DNS entries from host configurations
|
||||||
182
docs/plans/nixos-router.md
Normal file
182
docs/plans/nixos-router.md
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
# NixOS Router — Replace EdgeRouter
|
||||||
|
|
||||||
|
Replace the aging Ubiquiti EdgeRouter (gw, 10.69.10.1) with a NixOS-based router.
|
||||||
|
The EdgeRouter is suspected to be a throughput bottleneck. A NixOS router integrates
|
||||||
|
naturally with the existing fleet: same config management, same monitoring pipeline,
|
||||||
|
same deployment workflow.
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
- Eliminate the EdgeRouter throughput bottleneck
|
||||||
|
- Full integration with existing monitoring (node-exporter, promtail, Prometheus, Loki)
|
||||||
|
- Declarative firewall and routing config managed in the flake
|
||||||
|
- Inter-VLAN routing for all existing subnets
|
||||||
|
- DHCP server for client subnets
|
||||||
|
- NetFlow/traffic accounting for future ntopng integration
|
||||||
|
- Foundation for WireGuard remote access (see remote-access.md)
|
||||||
|
|
||||||
|
## Current Network Topology
|
||||||
|
|
||||||
|
**Subnets (known VLANs):**
|
||||||
|
| VLAN/Subnet | Purpose | Notable hosts |
|
||||||
|
|----------------|------------------|----------------------------------------|
|
||||||
|
| 10.69.10.0/24 | Gateway | gw (10.69.10.1) |
|
||||||
|
| 10.69.12.0/24 | Core services | nas, pve1, arr jails, restic |
|
||||||
|
| 10.69.13.0/24 | Infrastructure | All NixOS servers (static IPs) |
|
||||||
|
| 10.69.22.0/24 | WLAN | unifi-ctrl |
|
||||||
|
| 10.69.30.0/24 | Workstations | gunter |
|
||||||
|
| 10.69.31.0/24 | Media | media |
|
||||||
|
| 10.69.99.0/24 | Management | sw1 (MikroTik CRS326-24G-2S+) |
|
||||||
|
|
||||||
|
**DNS:** ns1 (10.69.13.5) and ns2 (10.69.13.6) handle all resolution. Upstream is
|
||||||
|
Cloudflare/Google over DoT via Unbound.
|
||||||
|
|
||||||
|
**Switch:** MikroTik CRS326-24G-2S+ — L2 switching with VLAN trunking. Capable of
|
||||||
|
L3 routing via RouterOS but not ideal for sustained routing throughput.
|
||||||
|
|
||||||
|
## Hardware
|
||||||
|
|
||||||
|
Needs a small x86 box with:
|
||||||
|
- At least 2 NICs (WAN + LAN trunk). Dual 2.5GbE preferred.
|
||||||
|
- Enough CPU for nftables NAT at line rate (any modern x86 is fine)
|
||||||
|
- 4-8 GB RAM (plenty for routing + DHCP + NetFlow accounting)
|
||||||
|
- Low power consumption, fanless preferred for always-on use
|
||||||
|
|
||||||
|
**Leading candidate:** [Topton Solid Mini PC](https://www.aliexpress.com/item/1005008981218625.html)
|
||||||
|
with Intel i3-N300 (8 E-cores), 2x 10GbE SFP+ plus 3x 2.5GbE (~NOK 3000 barebones). The N300
|
||||||
|
gives headroom for ntopng DPI and potential Suricata IDS without being overkill.
|
||||||
|
|
||||||
|
### Hardware Alternatives
|
||||||
|
|
||||||
|
Domestic availability for firewall mini PCs is limited — likely ordering from AliExpress.
|
||||||
|
|
||||||
|
Key things to verify:
|
||||||
|
- NIC chipset: Intel i225-V/i226-V preferred over Realtek for Linux driver support
|
||||||
|
- RAM/storage: some listings are barebones, check what's included
|
||||||
|
- Import duties: factor in ~25% on top of listing price
|
||||||
|
|
||||||
|
| Option | NICs | Notes | Price |
|
||||||
|
|--------|------|-------|-------|
|
||||||
|
| [Topton Solid Firewall Router](https://www.aliexpress.com/item/1005008059819023.html) | 2x10GbE SFP+, 4x2.5GbE | No RAM/SSD, only Intel N150 available currently | ~NOK 2500 |
|
||||||
|
| [Topton Solid Mini PC](https://www.aliexpress.com/item/1005008981218625.html) | 2x10GbE SFP+, 3x2.5GbE | No RAM/SSD, only Intel i3-N300 available currently | ~NOK 3000 |
|
||||||
|
| [MINISFORUM MS-01](https://www.aliexpress.com/item/1005007308262492.html) | 2x10GbE SFP+, 2x2.5GbE | No RAM/SSD, i5-12600H | ~NOK 4500 |
|
||||||
|
|
||||||
|
The LAN port would carry a VLAN trunk to the MikroTik switch, with sub-interfaces
|
||||||
|
for each VLAN. WAN port connects to the ISP uplink.
|
||||||
|
|
||||||
|
## NixOS Configuration
|
||||||
|
|
||||||
|
### Stability Policy
|
||||||
|
|
||||||
|
The router is treated differently from the rest of the fleet:
|
||||||
|
- **No auto-upgrade** — `system.autoUpgrade.enable = false`
|
||||||
|
- **No homelab-deploy listener** — `homelab.deploy.enable = false`
|
||||||
|
- **Manual updates only** — update every few months, test-build first
|
||||||
|
- **Use `nixos-rebuild boot`** — changes take effect on next deliberate reboot
|
||||||
|
- **Tier: prod, priority: high** — alerts for this host are treated with the highest priority
|
||||||
|
|
||||||
|
### Core Services
|
||||||
|
|
||||||
|
**Routing & NAT:**
|
||||||
|
- `systemd-networkd` for all interface config (consistent with rest of fleet)
|
||||||
|
- VLAN sub-interfaces on the LAN trunk (one per subnet)
|
||||||
|
- `networking.nftables` for stateful firewall and NAT
|
||||||
|
- IP forwarding enabled (`net.ipv4.ip_forward = 1`)
|
||||||
|
- Masquerade outbound traffic on WAN interface
|
||||||
|
|
||||||
|
**DHCP:**
|
||||||
|
- Kea or dnsmasq for DHCP on client subnets (WLAN, workstations, media)
|
||||||
|
- Infrastructure subnet (10.69.13.0/24) stays static — no DHCP needed
|
||||||
|
- Static leases for known devices
|
||||||
|
|
||||||
|
**Firewall (nftables):**
|
||||||
|
- Default deny between VLANs
|
||||||
|
- Explicit allow rules for known cross-VLAN traffic:
|
||||||
|
- All subnets → ns1/ns2 (DNS)
|
||||||
|
- All subnets → monitoring01 (metrics/logs)
|
||||||
|
- Infrastructure → all (management access)
|
||||||
|
- Workstations → media, core services
|
||||||
|
- NAT masquerade on WAN
|
||||||
|
- Rate limiting on WAN-facing services
|
||||||
|
|
||||||
|
**Traffic Accounting:**
|
||||||
|
- nftables flow accounting or softflowd for NetFlow export
|
||||||
|
- Export to future ntopng instance (see new-services.md)
|
||||||
|
|
||||||
|
**IDS/IPS (future consideration):**
|
||||||
|
- Suricata for inline intrusion detection/prevention on the WAN interface
|
||||||
|
- Signature-based threat detection, protocol anomaly detection
|
||||||
|
- CPU-intensive — feasible at typical home internet speeds (500Mbps-1Gbps) on the N300
|
||||||
|
- Not a day-one requirement, but the hardware should support it
|
||||||
|
|
||||||
|
### Monitoring Integration
|
||||||
|
|
||||||
|
Since this is a NixOS host in the flake, it gets the standard monitoring stack for free:
|
||||||
|
- node-exporter for system metrics (CPU, memory, NIC throughput per interface)
|
||||||
|
- promtail shipping logs to Loki
|
||||||
|
- Prometheus scrape target auto-registration
|
||||||
|
- Alertmanager alerts for host-down, high CPU, etc.
|
||||||
|
|
||||||
|
Additional router-specific monitoring:
|
||||||
|
- Per-VLAN interface traffic metrics via node-exporter (automatic for all interfaces)
|
||||||
|
- NAT connection tracking table size
|
||||||
|
- WAN uplink status and throughput
|
||||||
|
- DHCP lease metrics (if Kea, it has a Prometheus exporter)
|
||||||
|
|
||||||
|
This is a significant advantage over the EdgeRouter — full observability through
|
||||||
|
the existing Grafana dashboards and Loki log search, debuggable via the monitoring
|
||||||
|
MCP tools.
|
||||||
|
|
||||||
|
### WireGuard Integration
|
||||||
|
|
||||||
|
The remote access plan (remote-access.md) currently proposes a separate `extgw01`
|
||||||
|
gateway host. With a NixOS router, there's a decision to make:
|
||||||
|
|
||||||
|
**Option A:** WireGuard terminates on the router itself. Simplest topology — the
|
||||||
|
router is already the gateway, so VPN traffic doesn't need extra hops or firewall
|
||||||
|
rules. But adds complexity to the router, which should stay simple.
|
||||||
|
|
||||||
|
**Option B:** Keep extgw01 as a separate host (original plan). Router just routes
|
||||||
|
traffic to it. Better separation of concerns, router stays minimal.
|
||||||
|
|
||||||
|
Recommendation: Start with option B (keep it separate). The router should do routing
|
||||||
|
and nothing else. WireGuard can move to the router later if extgw01 feels redundant.
|
||||||
|
|
||||||
|
## Migration Plan
|
||||||
|
|
||||||
|
### Phase 1: Build and lab test
|
||||||
|
- Acquire hardware
|
||||||
|
- Create host config in the flake (routing, NAT, DHCP, firewall)
|
||||||
|
- Test-build on workstation: `nix build .#nixosConfigurations.router01.config.system.build.toplevel`
|
||||||
|
- Lab test with a temporary setup if possible (two NICs, isolated VLAN)
|
||||||
|
|
||||||
|
### Phase 2: Prepare cutover
|
||||||
|
- Pre-configure the MikroTik switch trunk port for the new router
|
||||||
|
- Document current EdgeRouter config (port forwarding, NAT rules, DHCP leases)
|
||||||
|
- Replicate all rules in the NixOS config
|
||||||
|
- Verify DNS, DHCP, and inter-VLAN routing work in test
|
||||||
|
|
||||||
|
### Phase 3: Cutover
|
||||||
|
- Schedule a maintenance window (brief downtime expected)
|
||||||
|
- Swap WAN cable from EdgeRouter to new router
|
||||||
|
- Swap LAN trunk from EdgeRouter to new router
|
||||||
|
- Verify connectivity from each VLAN
|
||||||
|
- Verify internet access, DNS resolution, inter-VLAN routing
|
||||||
|
- Monitor via Prometheus/Loki (immediately available since it's a fleet host)
|
||||||
|
|
||||||
|
### Phase 4: Decommission EdgeRouter
|
||||||
|
- Keep EdgeRouter available as fallback for a few weeks
|
||||||
|
- Remove `gw` entry from external-hosts.nix, replace with flake-managed host
|
||||||
|
- Update any references to 10.69.10.1 if the router IP changes
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- **Router IP:** Keep 10.69.10.1 or move to a different address? Each VLAN
|
||||||
|
sub-interface needs an IP (the gateway address for that subnet).
|
||||||
|
- **ISP uplink:** What type of WAN connection? PPPoE, DHCP, static IP?
|
||||||
|
- **Port forwarding:** What ports are currently forwarded on the EdgeRouter?
|
||||||
|
These need to be replicated in nftables.
|
||||||
|
- **DHCP scope:** Which subnets currently get DHCP from the EdgeRouter vs
|
||||||
|
other sources (UniFi controller for WLAN?)?
|
||||||
|
- **UPnP/NAT-PMP:** Needed for any devices? (gaming consoles, etc.)
|
||||||
|
- **Hardware preference:** Fanless mini PC budget and preferred vendor?
|
||||||
104
docs/plans/openstack-nixos-image.md
Normal file
104
docs/plans/openstack-nixos-image.md
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
# NixOS OpenStack Image
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Build and upload a NixOS base image to the OpenStack cluster at work, enabling NixOS-based VPS instances to replace the current Debian+Podman setup. This image will serve as the foundation for multiple external services:
|
||||||
|
|
||||||
|
- **Forgejo** (replacing Gitea on docker2)
|
||||||
|
- **WireGuard gateway** (replacing docker2's tunnel role, feeding into the remote-access plan)
|
||||||
|
- Any future externally-hosted services
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- VPS hosting runs on an OpenStack cluster with a personal quota
|
||||||
|
- Current VPS (`docker2.t-juice.club`) runs Debian with Podman containers
|
||||||
|
- Homelab already has a working Proxmox image pipeline: `template2` builds via `nixos-rebuild build-image --image-variant proxmox`, deployed via Ansible
|
||||||
|
- nixpkgs has a built-in `openstack` image variant in the same `image.modules` system used for Proxmox
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
- **No cloud-init dependency** - SSH key baked into the image, no need for metadata service
|
||||||
|
- **No bootstrap script** - VPS deployments are infrequent; manual `nixos-rebuild` after first boot is fine
|
||||||
|
- **No Vault access** - secrets handled manually until WireGuard access is set up (see remote-access plan)
|
||||||
|
- **Separate from homelab services** - no logging/metrics integration initially; revisit after remote-access WireGuard is in place
|
||||||
|
- **Repo placement TBD** - keep in this flake for now for convenience, but external hosts may move to a separate flake later since they can't use most shared `system/` modules (no Vault, no internal DNS, no Promtail)
|
||||||
|
- **OpenStack CLI in devshell** - add `openstackclient` package; credentials (`clouds.yaml`) stay outside the repo
|
||||||
|
- **Parallel deployment** - new Forgejo instance runs alongside docker2 initially, then CNAME moves over
|
||||||
|
|
||||||
|
## Approach
|
||||||
|
|
||||||
|
Follow the same pattern as the Proxmox template (`hosts/template2`), but targeting OpenStack's qcow2 format.
|
||||||
|
|
||||||
|
### What nixpkgs provides
|
||||||
|
|
||||||
|
The `image.modules.openstack` module produces a qcow2 image with:
|
||||||
|
- `openstack-config.nix`: EC2 metadata fetcher, SSH enabled, GRUB bootloader, serial console, auto-growing root partition
|
||||||
|
- `qemu-guest.nix` profile (virtio drivers)
|
||||||
|
- ext4 root filesystem with `autoResize`
|
||||||
|
|
||||||
|
### What we need to customize
|
||||||
|
|
||||||
|
The stock OpenStack image pulls SSH keys and hostname from EC2-style metadata. Since we're baking the SSH key into the image, we need a simpler configuration:
|
||||||
|
|
||||||
|
- SSH authorized keys baked into the image
|
||||||
|
- Base packages (age, vim, wget, git)
|
||||||
|
- Nix substituters (`cache.nixos.org` only - internal cache not reachable)
|
||||||
|
- systemd-networkd with DHCP
|
||||||
|
- GRUB bootloader
|
||||||
|
- Firewall enabled (public-facing host)
|
||||||
|
|
||||||
|
### Differences from template2
|
||||||
|
|
||||||
|
| Aspect | template2 (Proxmox) | openstack-template (OpenStack) |
|
||||||
|
|--------|---------------------|-------------------------------|
|
||||||
|
| Image format | VMA (`.vma.zst`) | qcow2 (`.qcow2`) |
|
||||||
|
| Image variant | `proxmox` | `openstack` |
|
||||||
|
| Cloud-init | ConfigDrive + NoCloud | Not used (SSH key baked in) |
|
||||||
|
| Nix cache | Internal + nixos.org | `cache.nixos.org` only |
|
||||||
|
| Vault | AppRole via wrapped token | None |
|
||||||
|
| Bootstrap | Automatic nixos-rebuild on first boot | Manual |
|
||||||
|
| Network | Internal DHCP | OpenStack DHCP |
|
||||||
|
| DNS | Internal ns1/ns2 | Public DNS |
|
||||||
|
| Firewall | Disabled (trusted network) | Enabled |
|
||||||
|
| System modules | Full `../../system` import | Minimal (sshd, packages only) |
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
### Phase 1: Build the image
|
||||||
|
|
||||||
|
1. Create `hosts/openstack-template/` with minimal configuration
|
||||||
|
- `default.nix` - imports (only sshd and packages from `system/`, not the full set)
|
||||||
|
- `configuration.nix` - base config: SSH key, DHCP, GRUB, base packages, firewall on
|
||||||
|
- `hardware-configuration.nix` - qemu-guest profile with virtio drivers
|
||||||
|
- Exclude from DNS and monitoring (`homelab.dns.enable = false`, `homelab.monitoring.enable = false`)
|
||||||
|
- May need to override parts of `image.modules.openstack` to disable the EC2 metadata fetcher if it causes boot delays
|
||||||
|
2. Build with `nixos-rebuild build-image --image-variant openstack --flake .#openstack-template`
|
||||||
|
3. Verify the qcow2 image is produced in `result/`
|
||||||
|
|
||||||
|
### Phase 2: Upload and test
|
||||||
|
|
||||||
|
1. Add `openstackclient` to the devshell
|
||||||
|
2. Upload image: `openstack image create --disk-format qcow2 --file result/<image>.qcow2 nixos-template`
|
||||||
|
3. Boot a test instance from the image
|
||||||
|
4. Verify: SSH access works, DHCP networking, Nix builds work
|
||||||
|
5. Test manual `nixos-rebuild switch --flake` against the instance
|
||||||
|
|
||||||
|
### Phase 3: Automation (optional, later)
|
||||||
|
|
||||||
|
Consider an Ansible playbook similar to `build-and-deploy-template.yml` for image builds + uploads. Low priority since this will be done rarely.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] Should external VPS hosts eventually move to a separate flake? (Depends on how different they end up being from homelab hosts)
|
||||||
|
- [ ] Will the stock `openstack-config.nix` metadata fetcher cause boot delays/errors if the metadata service isn't reachable? May need to disable it.
|
||||||
|
- [ ] **Flavor selection** - investigate what flavors are available in the quota. The standard small flavors likely have insufficient root disk for a NixOS host (Nix store grows fast). Options:
|
||||||
|
- Use a larger flavor with adequate root disk
|
||||||
|
- Create a custom flavor (if permissions allow)
|
||||||
|
- Cinder block storage is an option in theory, but was very slow last time it was tested - avoid if possible
|
||||||
|
- [ ] Consolidation opportunity - currently running multiple smaller VMs on OpenStack. Could a single larger NixOS VM replace several of them?
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- `nixos-rebuild build-image --image-variant openstack` uses the same `image.modules` system as Proxmox
|
||||||
|
- nixpkgs also has an `openstack-zfs` variant if ZFS root is ever wanted
|
||||||
|
- The stock OpenStack module imports `ec2-data.nix` and `amazon-init.nix` - these may need to be disabled or overridden if they cause issues without a metadata service
|
||||||
231
docs/plans/pn51-stability.md
Normal file
231
docs/plans/pn51-stability.md
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
# ASUS PN51 Stability Testing
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Two ASUS PN51-E1 mini PCs (Ryzen 7 5700U) purchased years ago but shelved due to stability issues. Revisiting them to potentially add to the homelab.
|
||||||
|
|
||||||
|
## Hardware
|
||||||
|
|
||||||
|
| | pn01 (10.69.12.60) | pn02 (10.69.12.61) |
|
||||||
|
|---|---|---|
|
||||||
|
| **CPU** | AMD Ryzen 7 5700U (8C/16T) | AMD Ryzen 7 5700U (8C/16T) |
|
||||||
|
| **RAM** | 2x 32GB DDR4 SO-DIMM (64GB) | 1x 32GB DDR4 SO-DIMM (32GB) |
|
||||||
|
| **Storage** | 1TB NVMe | 1TB Samsung 870 EVO (SATA SSD) |
|
||||||
|
| **BIOS** | 0508 (2023-11-08) | Updated 2026-02-21 (latest from ASUS) |
|
||||||
|
|
||||||
|
## Original Issues
|
||||||
|
|
||||||
|
- **pn01**: Would boot but freeze randomly after some time. No console errors, completely unresponsive. memtest86 passed.
|
||||||
|
- **pn02**: Had trouble booting — would start loading kernel from installer USB then instantly reboot. When it did boot, would also freeze randomly.
|
||||||
|
|
||||||
|
## Debugging Steps
|
||||||
|
|
||||||
|
### 2026-02-21: Initial Setup
|
||||||
|
|
||||||
|
1. **Disabled fTPM** (labeled "Security Device" in ASUS BIOS) on both units
|
||||||
|
- AMD Ryzen 5000 series had a known fTPM bug causing random hard freezes with no console output
|
||||||
|
- Both units booted the NixOS installer successfully after this change
|
||||||
|
2. Installed NixOS on both, added to repo as `pn01` and `pn02` on VLAN 12
|
||||||
|
3. Configured monitoring (node-exporter, promtail, nixos-exporter)
|
||||||
|
|
||||||
|
### 2026-02-21: pn02 First Freeze
|
||||||
|
|
||||||
|
- pn02 froze approximately 1 hour after boot
|
||||||
|
- All three Prometheus targets went down simultaneously — hard freeze, not graceful shutdown
|
||||||
|
- Journal on next boot: `system.journal corrupted or uncleanly shut down`
|
||||||
|
- Kernel warnings from boot log before freeze:
|
||||||
|
- **TSC clocksource unstable**: `Marking clocksource 'tsc' as unstable because the skew is too large` — TSC skewing ~3.8ms over 500ms relative to HPET watchdog
|
||||||
|
- **AMD PSP error**: `psp gfx command LOAD_TA(0x1) failed and response status is (0x7)` — Platform Security Processor failing to load trusted application
|
||||||
|
- pn01 did not show these warnings on this particular boot, but has shown them historically (see below)
|
||||||
|
|
||||||
|
### 2026-02-21: pn02 BIOS Update
|
||||||
|
|
||||||
|
- Updated pn02 BIOS to latest version from ASUS website
|
||||||
|
- **TSC still unstable** after BIOS update — same ~3.8ms skew
|
||||||
|
- **PSP LOAD_TA still failing** after BIOS update
|
||||||
|
- Monitoring back up, letting it run to see if freeze recurs
|
||||||
|
|
||||||
|
### 2026-02-22: TSC/PSP Confirmed on Both Units
|
||||||
|
|
||||||
|
- Checked kernel logs after ~9 hours uptime — both units still running
|
||||||
|
- **pn01 now shows TSC unstable and PSP LOAD_TA failure** on this boot (same ~3.8ms TSC skew, same PSP error)
|
||||||
|
- pn01 had these same issues historically when tested years ago — the earlier clean boot was just lucky TSC calibration timing
|
||||||
|
- **Conclusion**: TSC instability and PSP LOAD_TA are platform-level quirks of the PN51-E1 / Ryzen 5700U, present on both units
|
||||||
|
- The kernel handles TSC instability gracefully (falls back to HPET), and PSP LOAD_TA is non-fatal
|
||||||
|
- Neither issue is likely the cause of the hard freezes — the fTPM bug remains the primary suspect
|
||||||
|
|
||||||
|
### 2026-02-22: Stress Test (1 hour)
|
||||||
|
|
||||||
|
- Ran `stress-ng --cpu 16 --vm 2 --vm-bytes 8G --timeout 1h` on both units
|
||||||
|
- CPU temps peaked at ~85°C, settled to ~80°C sustained (throttle limit is 105°C)
|
||||||
|
- Both survived the full hour with no freezes, no MCE errors, no kernel issues
|
||||||
|
- No concerning log entries during or after the test
|
||||||
|
|
||||||
|
### 2026-02-22: TSC Runtime Switch Test
|
||||||
|
|
||||||
|
- Attempted to switch clocksource back to TSC at runtime on pn01:
|
||||||
|
```
|
||||||
|
echo tsc > /sys/devices/system/clocksource/clocksource0/current_clocksource
|
||||||
|
```
|
||||||
|
- Kernel watchdog immediately reverted to HPET — TSC skew is ongoing, not just a boot-time issue
|
||||||
|
- **Conclusion**: TSC is genuinely unstable on the PN51-E1 platform. HPET is the correct clocksource.
|
||||||
|
- For virtualization (Incus), this means guest VMs will use HPET-backed timing. Performance impact is minimal for typical server workloads (DNS, monitoring, light services) but would matter for latency-sensitive applications.
|
||||||
|
|
||||||
|
### 2026-02-22: BIOS Tweaks (Both Units)
|
||||||
|
|
||||||
|
- Disabled ErP Ready on both (EU power efficiency mode — aggressively cuts power in idle)
|
||||||
|
- Disabled WiFi and Bluetooth in BIOS on both
|
||||||
|
- **TSC still unstable** after these changes — same ~3.8ms skew on both units
|
||||||
|
- ErP/power states are not the cause of the TSC issue
|
||||||
|
|
||||||
|
### 2026-02-22: pn02 Second Freeze
|
||||||
|
|
||||||
|
- pn02 froze again ~5.5 hours after boot (at idle, not under load)
|
||||||
|
- All Prometheus targets down simultaneously — same hard freeze pattern
|
||||||
|
- Last log entry was normal nix-daemon activity — zero warning/error logs before crash
|
||||||
|
- Survived the 1h stress test earlier but froze at idle later — not thermal
|
||||||
|
- pn01 remains stable throughout
|
||||||
|
- **Action**: Blacklisted `amdgpu` kernel module on pn02 (`boot.blacklistedKernelModules = [ "amdgpu" ]`) to eliminate GPU/PSP firmware interactions as a cause. This sacrifices local console output, but the host is managed via SSH anyway.
|
||||||
|
- **Action**: Added diagnostic/recovery config to pn02:
|
||||||
|
- `panic=10` + `nmi_watchdog=1` kernel params — auto-reboot after 10s on panic
|
||||||
|
- `softlockup_panic` + `hardlockup_panic` sysctls — convert lockups to panics with stack traces
|
||||||
|
- `hardware.rasdaemon` with recording — logs hardware errors (MCE, PCIe AER, memory) to sqlite database, survives reboots
|
||||||
|
- Check recorded errors: `ras-mc-ctl --summary`, `ras-mc-ctl --errors`
|
||||||
|
|
||||||
|
## Benign Kernel Errors (Both Units)
|
||||||
|
|
||||||
|
These appear on both units and can be ignored:
|
||||||
|
- `clocksource: Marking clocksource 'tsc' as unstable` — TSC skew vs HPET, kernel falls back gracefully. Platform-level quirk on PN51-E1, not always reproducible on every boot.
|
||||||
|
- `psp gfx command LOAD_TA(0x1) failed` — AMD PSP firmware error, non-fatal. Present on both units across all BIOS versions.
|
||||||
|
- `pcie_mp2_amd: amd_sfh_hid_client_init failed err -95` — AMD Sensor Fusion Hub, no sensors connected
|
||||||
|
- `Bluetooth: hci0: Reading supported features failed` — Bluetooth init quirk
|
||||||
|
- `Serial bus multi instantiate pseudo device driver INT3515:00: error -ENXIO` — unused serial bus device
|
||||||
|
- `snd_hda_intel: no codecs found` — no audio device connected, headless server
|
||||||
|
- `ata2.00: supports DRM functions and may not be fully accessible` — Samsung SSD DRM quirk (pn02 only)
|
||||||
|
|
||||||
|
### 2026-02-23: processor.max_cstate=1 and Proxmox Forums
|
||||||
|
|
||||||
|
- Found a thread on the Proxmox forums about PN51 units with similar freeze issues
|
||||||
|
- Many users reporting identical symptoms — random hard freezes, no log evidence
|
||||||
|
- No conclusive fix. Some have frequent freezes, others only a few times a month
|
||||||
|
- Some reported BIOS updates helped, but results inconsistent
|
||||||
|
- Added `processor.max_cstate=1` kernel parameter to pn02 — limits CPU to C1 halt state, preventing deep C-state sleep transitions that may trigger freezes on AMD mobile chips
|
||||||
|
- Also applied: amdgpu blacklist, panic=10, nmi_watchdog=1, softlockup/hardlockup panic, rasdaemon
|
||||||
|
|
||||||
|
### 2026-02-23: logind D-Bus Deadlock (pn02)
|
||||||
|
|
||||||
|
- node-exporter alert fired — but host was NOT frozen
|
||||||
|
- logind was running (PID 871) but deadlocked on D-Bus — not responding to `org.freedesktop.login1` requests
|
||||||
|
- Every node-exporter scrape blocked for 25s waiting for logind, causing scrape timeouts
|
||||||
|
- Likely related to amdgpu blacklist — no DRM device means no graphical seat, logind may have deadlocked during seat enumeration at boot
|
||||||
|
- Fix: `systemctl restart systemd-logind` + `systemctl restart prometheus-node-exporter`
|
||||||
|
- After restart, logind responded normally and reported seat0
|
||||||
|
|
||||||
|
### 2026-02-27: pn02 Third Freeze
|
||||||
|
|
||||||
|
- pn02 crashed again after ~2 days 21 hours uptime (longest run so far)
|
||||||
|
- Evidence of crash:
|
||||||
|
- Journal file corrupted: `system.journal corrupted or uncleanly shut down`
|
||||||
|
- Boot partition fsck: `Dirty bit is set. Fs was not properly unmounted`
|
||||||
|
- No orderly shutdown logs from previous boot
|
||||||
|
- No auto-upgrade triggered
|
||||||
|
- **NMI watchdog did NOT fire** — no kernel panic logged. This is a true hard lockup below NMI level
|
||||||
|
- **rasdaemon recorded nothing** — no MCE, AER, or memory errors in the sqlite database
|
||||||
|
- **Positive**: The system auto-rebooted this time (likely hardware watchdog), unlike previous freezes that required manual power cycle
|
||||||
|
- `processor.max_cstate=1` may have extended uptime (2d21h vs previous 1h and 5.5h) but did not prevent the freeze
|
||||||
|
|
||||||
|
### 2026-02-27 to 2026-03-03: Relative Stability
|
||||||
|
|
||||||
|
- pn02 ran without crashes for approximately one week after the third freeze
|
||||||
|
- pn01 continued to be completely stable throughout this period
|
||||||
|
- Auto-upgrade reboots continued daily (~4am) on both units — these are planned and healthy
|
||||||
|
|
||||||
|
### 2026-03-04: pn02 Fourth Crash — sched_ext Kernel Oops (pstore captured)
|
||||||
|
|
||||||
|
- pn02 crashed after ~5.8 days uptime (504566s)
|
||||||
|
- **First crash captured by pstore** — kernel oops and panic stack traces preserved across reboot
|
||||||
|
- Journal corruption confirmed: `system.journal corrupted or uncleanly shut down`
|
||||||
|
- **Crash location**: `RIP: 0010:set_next_task_scx+0x6e/0x210` — crash in the **sched_ext (SCX) scheduler** subsystem
|
||||||
|
- **Call trace**: `sysvec_apic_timer_interrupt` → `cpuidle_enter_state` — crashed during CPU idle, triggered by APIC timer interrupt
|
||||||
|
- **CR2**: `ffffffffffffff89` — dereferencing an obviously invalid kernel pointer
|
||||||
|
- **Kernel**: 6.12.74 (NixOS 25.11)
|
||||||
|
- **Significance**: This is the first crash with actual diagnostic output. Previous crashes were silent sub-NMI freezes. The sched_ext scheduler path is a new finding — earlier crashes were assumed to be hardware-level.
|
||||||
|
|
||||||
|
### 2026-03-06: pn02 Fifth Crash
|
||||||
|
|
||||||
|
- pn02 crashed again — journal corruption on next boot
|
||||||
|
- No pstore data captured for this crash
|
||||||
|
|
||||||
|
### 2026-03-07: pn02 Sixth and Seventh Crashes — Two in One Day
|
||||||
|
|
||||||
|
**First crash (~11:06 UTC):**
|
||||||
|
- ~26.6 hours uptime (95994s)
|
||||||
|
- **pstore captured both Oops and Panic**
|
||||||
|
- **Crash location**: Scheduler code path — `pick_next_task_fair` → `__pick_next_task`
|
||||||
|
- **CR2**: `000000c000726000` — invalid pointer dereference
|
||||||
|
- **Notable**: `dbus-daemon` segfaulted ~50 minutes before the kernel crash (`segfault at 0` in `libdbus-1.so.3.32.4` on CPU 0) — may indicate memory corruption preceding the kernel crash
|
||||||
|
|
||||||
|
**Second crash (~21:15 UTC):**
|
||||||
|
- Journal corruption confirmed on next boot
|
||||||
|
- No pstore data captured
|
||||||
|
|
||||||
|
### 2026-03-07: pn01 Status
|
||||||
|
|
||||||
|
- pn01 has had **zero crashes** since initial setup on Feb 21
|
||||||
|
- Zero journal corruptions, zero pstore dumps in 30 days
|
||||||
|
- Boot ID transitions line up exactly with the daily auto-upgrade reboots — consistently clean shutdown/reboot cycles, no unexplained boots in between
|
||||||
|
- All 8 reboots in 30 days are planned auto-upgrade reboots
|
||||||
|
- **pn01 is fully stable**
|
||||||
|
|
||||||
|
## Crash Summary
|
||||||
|
|
||||||
|
| Date | Uptime Before Crash | Crash Type | Diagnostic Data |
|
||||||
|
|------|---------------------|------------|-----------------|
|
||||||
|
| Feb 21 | ~1h | Silent freeze | None — sub-NMI |
|
||||||
|
| Feb 22 | ~5.5h | Silent freeze | None — sub-NMI |
|
||||||
|
| Feb 27 | ~2d 21h | Silent freeze | None — sub-NMI, rasdaemon empty |
|
||||||
|
| Mar 4 | ~5.8d | **Kernel oops** | pstore: `set_next_task_scx` (sched_ext) |
|
||||||
|
| Mar 6 | Unknown | Crash | Journal corruption only |
|
||||||
|
| Mar 7 | ~26.6h | **Kernel oops + panic** | pstore: `pick_next_task_fair` (scheduler) + dbus segfault |
|
||||||
|
| Mar 7 | Unknown | Crash | Journal corruption only |
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
**pn02 is unreliable.** After exhausting mitigations (fTPM disabled, BIOS updated, WiFi/BT disabled, ErP disabled, amdgpu blacklisted, processor.max_cstate=1, NMI watchdog, rasdaemon), the unit still crashes every few days. 26 reboots in 30 days (7 unclean crashes + daily auto-upgrade reboots).
|
||||||
|
|
||||||
|
The pstore crash dumps from March reveal a new dimension: at least some crashes are **kernel scheduler bugs in sched_ext**, not just silent hardware-level freezes. The `set_next_task_scx` and `pick_next_task_fair` crash sites, combined with the dbus-daemon segfault before one crash, suggest possible memory corruption that manifests in the scheduler. It's unclear whether this is:
|
||||||
|
1. A sched_ext kernel bug exposed by the PN51's hardware quirks (unstable TSC, C-state behavior)
|
||||||
|
2. Hardware-induced memory corruption that happens to hit scheduler data structures
|
||||||
|
3. A pure software bug in the 6.12.74 kernel's sched_ext implementation
|
||||||
|
|
||||||
|
**pn01 is stable** — zero crashes in 30 days of continuous operation. Both units have identical kernel and NixOS configuration (minus pn02's diagnostic mitigations), so the difference points toward a hardware defect specific to the pn02 board.
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- **pn02 memtest**: Run memtest86 for 24h+ (available in systemd-boot menu). The crash signatures (userspace segfaults before kernel panics, corrupted pointers in scheduler structures) are consistent with intermittent RAM errors that a quick pass wouldn't catch. If memtest finds errors, swap the DIMM.
|
||||||
|
- **pn02**: Consider scrapping or repurposing for non-critical workloads that tolerate random reboots (auto-recovery via hardware watchdog is now working)
|
||||||
|
- **pn02 investigation**: Could try disabling sched_ext (`boot.kernelParams = [ "sched_ext.enabled=0" ]` or equivalent) to test whether the crashes stop — would help distinguish kernel bug from hardware defect
|
||||||
|
- **pn01**: Continue monitoring. If it remains stable long-term, it is viable for light workloads
|
||||||
|
- If pn01 eventually crashes, apply the same mitigations (amdgpu blacklist, max_cstate=1) to see if they help
|
||||||
|
- For the Incus hypervisor plan: likely need different hardware. Evaluating GMKtec G3 (Intel) as an alternative. Note: mixed Intel/AMD cluster complicates live migration
|
||||||
|
|
||||||
|
## Diagnostics and Auto-Recovery (pn02)
|
||||||
|
|
||||||
|
Currently deployed on pn02:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
boot.blacklistedKernelModules = [ "amdgpu" ];
|
||||||
|
boot.kernelParams = [ "panic=10" "nmi_watchdog=1" "processor.max_cstate=1" ];
|
||||||
|
boot.kernel.sysctl."kernel.softlockup_panic" = 1;
|
||||||
|
boot.kernel.sysctl."kernel.hardlockup_panic" = 1;
|
||||||
|
hardware.rasdaemon.enable = true;
|
||||||
|
hardware.rasdaemon.record = true;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Crash recovery is working**: pstore now captures kernel oops/panic data, and the system auto-reboots via `panic=10` or SP5100 TCO hardware watchdog.
|
||||||
|
|
||||||
|
**After reboot, check:**
|
||||||
|
- `ras-mc-ctl --summary` — overview of hardware errors
|
||||||
|
- `ras-mc-ctl --errors` — detailed error list
|
||||||
|
- `journalctl -b -1 -p err` — kernel logs from crashed boot (if panic was logged)
|
||||||
|
- pstore data is automatically archived by `systemd-pstore.service` and forwarded to Loki via promtail
|
||||||
121
docs/plans/remote-access.md
Normal file
121
docs/plans/remote-access.md
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
# Remote Access to Homelab Services
|
||||||
|
|
||||||
|
## Status: Planning
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Enable personal remote access to selected homelab services from outside the internal network, without exposing anything directly to the internet.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- All services are only accessible from the internal 10.69.13.x network
|
||||||
|
- http-proxy has a WireGuard tunnel (`wg0`, `10.69.222.0/24`) to a VPS (`docker2.t-juice.club`) on an OpenStack cluster
|
||||||
|
- VPS runs Traefik which proxies selected services (including Jellyfin) back through the tunnel to http-proxy's Caddy
|
||||||
|
- No other services are directly exposed to the public internet
|
||||||
|
|
||||||
|
## Decision: WireGuard Gateway
|
||||||
|
|
||||||
|
After evaluating WireGuard gateway vs Headscale (self-hosted Tailscale), the **WireGuard gateway** approach was chosen:
|
||||||
|
|
||||||
|
- Only 2 client devices (laptop + phone), so Headscale's device management UX isn't needed
|
||||||
|
- Split DNS works fine on Linux laptop via systemd-resolved; all-or-nothing DNS on phone is acceptable for occasional use
|
||||||
|
- Simpler infrastructure - no control server to maintain
|
||||||
|
- Builds on existing WireGuard experience and setup
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph TD
|
||||||
|
clients["Laptop / Phone"]
|
||||||
|
vps["VPS<br/>(WireGuard endpoint)"]
|
||||||
|
extgw["extgw01<br/>(gateway + bastion)"]
|
||||||
|
grafana["Grafana<br/>monitoring01:3000"]
|
||||||
|
jellyfin["Jellyfin<br/>jelly01:8096"]
|
||||||
|
arr["arr stack<br/>*-jail hosts"]
|
||||||
|
|
||||||
|
clients -->|WireGuard| vps
|
||||||
|
vps -->|WireGuard tunnel| extgw
|
||||||
|
extgw -->|allowed traffic| grafana
|
||||||
|
extgw -->|allowed traffic| jellyfin
|
||||||
|
extgw -->|allowed traffic| arr
|
||||||
|
```
|
||||||
|
|
||||||
|
### Existing path (unchanged)
|
||||||
|
|
||||||
|
The current public access path stays as-is:
|
||||||
|
|
||||||
|
```
|
||||||
|
Internet → VPS (Traefik) → WireGuard → http-proxy (Caddy) → internal services
|
||||||
|
```
|
||||||
|
|
||||||
|
This handles public Jellyfin access and any other publicly-exposed services.
|
||||||
|
|
||||||
|
### New path (personal VPN)
|
||||||
|
|
||||||
|
A separate WireGuard tunnel for personal remote access with restricted firewall rules:
|
||||||
|
|
||||||
|
```
|
||||||
|
Laptop/Phone → VPS (WireGuard peers) → tunnel → extgw01 (firewall) → allowed services
|
||||||
|
```
|
||||||
|
|
||||||
|
### Access tiers
|
||||||
|
|
||||||
|
1. **VPN (default)**: Laptop/phone connect to VPS WireGuard endpoint, traffic routed through extgw01 firewall. Only whitelisted services are reachable.
|
||||||
|
2. **SSH + 2FA (escalated)**: SSH into extgw01 for full network access when needed.
|
||||||
|
|
||||||
|
## New Host: extgw01
|
||||||
|
|
||||||
|
A NixOS host on the internal network acting as both WireGuard gateway and SSH bastion.
|
||||||
|
|
||||||
|
### Responsibilities
|
||||||
|
|
||||||
|
- **WireGuard tunnel** to the VPS for client traffic
|
||||||
|
- **Firewall** with allowlist controlling which internal services are reachable through the VPN
|
||||||
|
- **SSH bastion** with 2FA for full network access when needed
|
||||||
|
- **DNS**: Clients get split DNS config (laptop via systemd-resolved routing domain, phone uses internal DNS for all queries)
|
||||||
|
|
||||||
|
### Firewall allowlist (initial)
|
||||||
|
|
||||||
|
| Service | Destination | Port |
|
||||||
|
|------------|------------------------------|-------|
|
||||||
|
| Grafana | monitoring01.home.2rjus.net | 3000 |
|
||||||
|
| Jellyfin | jelly01.home.2rjus.net | 8096 |
|
||||||
|
| Sonarr | sonarr-jail.home.2rjus.net | 8989 |
|
||||||
|
| Radarr | radarr-jail.home.2rjus.net | 7878 |
|
||||||
|
| NZBget | nzbget-jail.home.2rjus.net | 6789 |
|
||||||
|
|
||||||
|
### SSH 2FA options (to be decided)
|
||||||
|
|
||||||
|
- **Kanidm**: Already deployed on kanidm01, supports RADIUS/OAuth2 for PAM integration
|
||||||
|
- **SSH certificates via OpenBao**: Fits existing Vault infrastructure, short-lived certs
|
||||||
|
- **TOTP via PAM**: Simplest fallback, Google Authenticator / similar
|
||||||
|
|
||||||
|
## VPS Configuration
|
||||||
|
|
||||||
|
The VPS needs a new WireGuard interface (separate from the existing http-proxy tunnel):
|
||||||
|
|
||||||
|
- WireGuard endpoint listening on a public UDP port
|
||||||
|
- 2 peers: laptop, phone
|
||||||
|
- Routes client traffic through tunnel to extgw01
|
||||||
|
- Minimal config - just routing, no firewall policy (that lives on extgw01)
|
||||||
|
|
||||||
|
## Implementation Steps
|
||||||
|
|
||||||
|
1. **Create extgw01 host configuration** in this repo
|
||||||
|
- VM provisioned via OpenTofu (same as other hosts)
|
||||||
|
- WireGuard interface for VPS tunnel
|
||||||
|
- nftables/iptables firewall with service allowlist
|
||||||
|
- IP forwarding enabled
|
||||||
|
2. **Configure VPS WireGuard** for client peers
|
||||||
|
- New WireGuard interface with laptop + phone peers
|
||||||
|
- Routing for 10.69.13.0/24 through extgw01 tunnel
|
||||||
|
3. **Set up client configs**
|
||||||
|
- Laptop: WireGuard config + systemd-resolved split DNS for `home.2rjus.net`
|
||||||
|
- Phone: WireGuard app config with DNS pointing at internal nameservers
|
||||||
|
4. **Set up SSH 2FA** on extgw01
|
||||||
|
- Evaluate Kanidm integration vs OpenBao SSH certs vs TOTP
|
||||||
|
5. **Test and verify**
|
||||||
|
- VPN access to allowed services only
|
||||||
|
- Firewall blocks everything else
|
||||||
|
- SSH + 2FA grants full access
|
||||||
|
- Existing public access path unaffected
|
||||||
224
docs/plans/security-hardening.md
Normal file
224
docs/plans/security-hardening.md
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
# Security Hardening Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Address security gaps identified in infrastructure review. Focus areas: SSH hardening, network security, logging improvements, and secrets management.
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
- SSH allows password auth and unrestricted root login (`system/sshd.nix`)
|
||||||
|
- Firewall disabled on all hosts (`networking.firewall.enable = false`)
|
||||||
|
- Promtail ships logs over HTTP to Loki
|
||||||
|
- Loki has no authentication (`auth_enabled = false`)
|
||||||
|
- AppRole secret-IDs never expire (`secret_id_ttl = 0`)
|
||||||
|
- Vault TLS verification disabled by default (`skipTlsVerify = true`)
|
||||||
|
- Audit logging exists (`common/ssh-audit.nix`) but not applied globally
|
||||||
|
- Alert rules focus on availability, no security event detection
|
||||||
|
|
||||||
|
## Priority Matrix
|
||||||
|
|
||||||
|
| Issue | Severity | Effort | Priority |
|
||||||
|
|-------|----------|--------|----------|
|
||||||
|
| SSH password auth | High | Low | **P1** |
|
||||||
|
| Firewall disabled | High | Medium | **P1** |
|
||||||
|
| Promtail HTTP (no TLS) | High | Medium | **P2** |
|
||||||
|
| No security alerting | Medium | Low | **P2** |
|
||||||
|
| Audit logging not global | Low | Low | **P2** |
|
||||||
|
| Loki no auth | Medium | Medium | **P3** |
|
||||||
|
| Secret-ID TTL | Medium | Medium | **P3** |
|
||||||
|
| Vault skipTlsVerify | Medium | Low | **P3** |
|
||||||
|
|
||||||
|
## Phase 1: Quick Wins (P1)
|
||||||
|
|
||||||
|
### 1.1 SSH Hardening
|
||||||
|
|
||||||
|
Edit `system/sshd.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
services.openssh = {
|
||||||
|
enable = true;
|
||||||
|
settings = {
|
||||||
|
PermitRootLogin = "prohibit-password"; # Key-only root login
|
||||||
|
PasswordAuthentication = false;
|
||||||
|
KbdInteractiveAuthentication = false;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prerequisite:** Verify all hosts have SSH keys deployed for root.
|
||||||
|
|
||||||
|
### 1.2 Enable Firewall
|
||||||
|
|
||||||
|
Create `system/firewall.nix` with default deny policy:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{ ... }: {
|
||||||
|
networking.firewall.enable = true;
|
||||||
|
|
||||||
|
# Use openssh's built-in firewall integration
|
||||||
|
services.openssh.openFirewall = true;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Useful firewall options:**
|
||||||
|
|
||||||
|
| Option | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| `networking.firewall.trustedInterfaces` | Accept all traffic from these interfaces (e.g., `[ "lo" ]`) |
|
||||||
|
| `networking.firewall.interfaces.<name>.allowedTCPPorts` | Per-interface port rules |
|
||||||
|
| `networking.firewall.extraInputRules` | Custom nftables rules (for complex filtering) |
|
||||||
|
|
||||||
|
**Network range restrictions:** Consider restricting SSH to the infrastructure subnet (`10.69.13.0/24`) using `extraInputRules` for defense in depth. However, this adds complexity and may not be necessary given the trusted network model.
|
||||||
|
|
||||||
|
#### Per-Interface Rules (http-proxy WireGuard)
|
||||||
|
|
||||||
|
The `http-proxy` host has a WireGuard interface (`wg0`) that may need different rules than the LAN interface. Use `networking.firewall.interfaces` to apply per-interface policies:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
# Example: http-proxy with different rules per interface
|
||||||
|
networking.firewall = {
|
||||||
|
enable = true;
|
||||||
|
|
||||||
|
# Default: only SSH (via openFirewall)
|
||||||
|
allowedTCPPorts = [ ];
|
||||||
|
|
||||||
|
# LAN interface: allow HTTP/HTTPS
|
||||||
|
interfaces.ens18 = {
|
||||||
|
allowedTCPPorts = [ 80 443 ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# WireGuard interface: restrict to specific services or trust fully
|
||||||
|
interfaces.wg0 = {
|
||||||
|
allowedTCPPorts = [ 80 443 ];
|
||||||
|
# Or use trustedInterfaces = [ "wg0" ] if fully trusted
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**TODO:** Investigate current WireGuard usage on http-proxy to determine appropriate rules.
|
||||||
|
|
||||||
|
Then per-host, open required ports:
|
||||||
|
|
||||||
|
| Host | Additional Ports |
|
||||||
|
|------|------------------|
|
||||||
|
| ns1/ns2 | 53 (TCP/UDP) |
|
||||||
|
| vault01 | 8200 |
|
||||||
|
| monitoring01 | 3100, 9090, 3000, 9093 |
|
||||||
|
| http-proxy | 80, 443 |
|
||||||
|
| nats1 | 4222 |
|
||||||
|
| ha1 | 1883, 8123 |
|
||||||
|
| jelly01 | 8096 |
|
||||||
|
| nix-cache01 | 5000 |
|
||||||
|
|
||||||
|
## Phase 2: Logging & Detection (P2)
|
||||||
|
|
||||||
|
### 2.1 Enable TLS for Promtail → Loki
|
||||||
|
|
||||||
|
Update `system/monitoring/logs.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
clients = [{
|
||||||
|
url = "https://monitoring01.home.2rjus.net:3100/loki/api/v1/push";
|
||||||
|
tls_config = {
|
||||||
|
ca_file = "/etc/ssl/certs/homelab-root-ca.pem";
|
||||||
|
};
|
||||||
|
}];
|
||||||
|
```
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- Configure Loki with TLS certificate (use internal ACME)
|
||||||
|
- Ensure all hosts trust root CA (already done via `system/pki/root-ca.nix`)
|
||||||
|
|
||||||
|
### 2.2 Security Alert Rules
|
||||||
|
|
||||||
|
Add to `services/monitoring/rules.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: security_rules
|
||||||
|
rules:
|
||||||
|
- alert: ssh_auth_failures
|
||||||
|
expr: increase(node_logind_sessions_total[5m]) > 20
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Unusual login activity on {{ $labels.instance }}"
|
||||||
|
|
||||||
|
- alert: vault_secret_fetch_failure
|
||||||
|
expr: increase(vault_secret_failures[5m]) > 5
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Vault secret fetch failures on {{ $labels.instance }}"
|
||||||
|
```
|
||||||
|
|
||||||
|
Also add Loki-based alerts for:
|
||||||
|
- Failed SSH attempts: `{job="systemd-journal"} |= "Failed password"`
|
||||||
|
- sudo usage: `{job="systemd-journal"} |= "sudo"`
|
||||||
|
|
||||||
|
### 2.3 Global Audit Logging
|
||||||
|
|
||||||
|
Add the `../common/ssh-audit.nix` import to `system/default.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
imports = [
|
||||||
|
# ... existing imports
|
||||||
|
../common/ssh-audit.nix
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
## Phase 3: Defense in Depth (P3)
|
||||||
|
|
||||||
|
### 3.1 Loki Authentication
|
||||||
|
|
||||||
|
Options:
|
||||||
|
1. **Basic auth via reverse proxy** - Put Loki behind Caddy with auth
|
||||||
|
2. **Loki multi-tenancy** - Enable `auth_enabled = true` and use tenant IDs
|
||||||
|
3. **Network isolation** - Bind Loki only to localhost, expose via authenticated proxy
|
||||||
|
|
||||||
|
Recommendation: Option 1 (reverse proxy) is simplest for homelab.
|
||||||
|
|
||||||
|
### 3.2 AppRole Secret Rotation
|
||||||
|
|
||||||
|
Update `terraform/vault/approle.tf`:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
secret_id_ttl = 2592000 # 30 days
|
||||||
|
```
|
||||||
|
|
||||||
|
Add documentation for manual rotation procedure or implement automated rotation via the existing `restartTrigger` mechanism in `vault-secrets.nix`.
|
||||||
|
|
||||||
|
### 3.3 Enable Vault TLS Verification
|
||||||
|
|
||||||
|
Change default in `system/vault-secrets.nix`:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
skipTlsVerify = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = false; # Changed from true
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prerequisite:** Verify all hosts trust the internal CA that signed the Vault certificate.
|
||||||
|
|
||||||
|
## Implementation Order
|
||||||
|
|
||||||
|
1. **Test on test-tier first** - Deploy phases 1-2 to testvm01/02/03
|
||||||
|
2. **Validate SSH access** - Ensure key-based login works before disabling passwords
|
||||||
|
3. **Document firewall ports** - Create reference of ports per host before enabling
|
||||||
|
4. **Phase prod rollout** - Deploy to prod hosts one at a time, verify each
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] Do all hosts have SSH keys configured for root access?
|
||||||
|
- [ ] Should firewall rules be per-host or use a central definition with roles?
|
||||||
|
- [ ] Should Loki authentication use the existing Kanidm setup?
|
||||||
|
|
||||||
|
**Resolved:** Password-based SSH access for recovery is not required - most hosts have console access through Proxmox or physical access, which provides an out-of-band recovery path if SSH keys fail.
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Firewall changes are the highest risk - test thoroughly on test-tier
|
||||||
|
- SSH hardening must not lock out access - verify keys first
|
||||||
|
- Consider creating a "break glass" procedure for emergency access if keys fail
|
||||||
156
docs/plans/truenas-migration.md
Normal file
156
docs/plans/truenas-migration.md
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
# TrueNAS Migration Planning
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
### Hardware
|
||||||
|
- CPU: AMD Ryzen 5 5600G with Radeon Graphics
|
||||||
|
- RAM: 32GB
|
||||||
|
- Network: 10GbE (mlxen0)
|
||||||
|
- Software: TrueNAS-13.0-U6.1 (Core)
|
||||||
|
|
||||||
|
### Storage Status
|
||||||
|
|
||||||
|
**hdd-pool**: 29.1TB total, **28.4TB used, 658GB free (97% capacity)** ⚠️
|
||||||
|
- mirror-0: 2x Seagate ST16000NE000 16TB HDD (16TB usable)
|
||||||
|
- mirror-1: 2x WD WD80EFBX 8TB HDD (8TB usable)
|
||||||
|
- mirror-2: 2x Seagate ST8000VN004 8TB HDD (8TB usable)
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Expand storage capacity for the main hdd-pool. Since we need to add disks anyway, also evaluating whether to upgrade or replace the entire system.
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
### Migration Approach: Option 3 - Migrate to NixOS
|
||||||
|
|
||||||
|
**Decision**: Replace TrueNAS with NixOS bare metal installation
|
||||||
|
|
||||||
|
**Rationale**:
|
||||||
|
- Aligns with existing infrastructure (16+ NixOS hosts already managed in this repo)
|
||||||
|
- Declarative configuration fits homelab philosophy
|
||||||
|
- Automatic monitoring/logging integration (Prometheus + Promtail)
|
||||||
|
- Auto-upgrades via same mechanism as other hosts
|
||||||
|
- SOPS secrets management integration
|
||||||
|
- TrueNAS-specific features (WebGUI, jails) not heavily utilized
|
||||||
|
|
||||||
|
**Service migration**:
|
||||||
|
- radarr/sonarr: Native NixOS services (`services.radarr`, `services.sonarr`)
|
||||||
|
- restic-rest: `services.restic.server`
|
||||||
|
- nzbget: NixOS service or OCI container
|
||||||
|
- NFS exports: `services.nfs.server`
|
||||||
|
|
||||||
|
### Filesystem: Keep ZFS
|
||||||
|
|
||||||
|
**Decision**: Keep existing ZFS pool, import on NixOS
|
||||||
|
|
||||||
|
**Rationale**:
|
||||||
|
- **No data migration needed**: Existing ZFS pool can be imported directly on NixOS
|
||||||
|
- **Proven reliability**: Pool has been running reliably on TrueNAS
|
||||||
|
- **NixOS ZFS support**: Well-supported, declarative configuration via `boot.zfs` and `services.zfs`
|
||||||
|
- **BTRFS RAID5/6 unreliable**: Research showed BTRFS RAID5/6 write hole is still unresolved
|
||||||
|
- **BTRFS RAID1 wasteful**: With mixed disk sizes, RAID1 wastes significant capacity vs ZFS mirrors
|
||||||
|
- Checksumming, snapshots, compression (lz4/zstd) all available
|
||||||
|
|
||||||
|
### Hardware: Keep Existing + Add Disks
|
||||||
|
|
||||||
|
**Decision**: Retain current hardware, expand disk capacity
|
||||||
|
|
||||||
|
**Hardware to keep**:
|
||||||
|
- AMD Ryzen 5 5600G (sufficient for NAS workload)
|
||||||
|
- 32GB RAM (adequate)
|
||||||
|
- 10GbE network interface
|
||||||
|
- Chassis
|
||||||
|
|
||||||
|
**Storage architecture**:
|
||||||
|
|
||||||
|
**hdd-pool** (ZFS mirrors):
|
||||||
|
- Current: 3 mirror vdevs (2x16TB + 2x8TB + 2x8TB) = 32TB usable
|
||||||
|
- Add: mirror-3 with 2x 24TB = +24TB usable
|
||||||
|
- Total after expansion: ~56TB usable
|
||||||
|
- Use: Media, downloads, backups, non-critical data
|
||||||
|
|
||||||
|
### Disk Purchase Decision
|
||||||
|
|
||||||
|
**Decision**: 2x 24TB drives (ordered, arriving 2026-02-21)
|
||||||
|
|
||||||
|
## Migration Strategy
|
||||||
|
|
||||||
|
### High-Level Plan
|
||||||
|
|
||||||
|
1. **Expand ZFS pool** (on TrueNAS):
|
||||||
|
- Install 2x 24TB drives (may need new drive trays - order from abroad if needed)
|
||||||
|
- If chassis space is limited, temporarily replace the two oldest 8TB drives (da0/ada4)
|
||||||
|
- Add as mirror-3 vdev to hdd-pool
|
||||||
|
- Verify pool health and resilver completes
|
||||||
|
- Check SMART data on old 8TB drives (all healthy as of 2026-02-20, no reallocated sectors)
|
||||||
|
- Burn-in: at minimum short + long SMART test before adding to pool
|
||||||
|
|
||||||
|
2. **Prepare NixOS configuration**:
|
||||||
|
- Create host configuration (`hosts/nas1/` or similar)
|
||||||
|
- Configure ZFS pool import (`boot.zfs.extraPools`)
|
||||||
|
- Set up services: radarr, sonarr, nzbget, restic-rest, NFS
|
||||||
|
- Configure monitoring (node-exporter, promtail, smartctl-exporter)
|
||||||
|
|
||||||
|
3. **Install NixOS**:
|
||||||
|
   - `zpool export hdd-pool` on TrueNAS before shutdown (clean export)
|
||||||
|
- Wipe TrueNAS boot-pool SSDs, set up as mdadm RAID1 for NixOS root
|
||||||
|
- Install NixOS on mdadm mirror (keeps boot path ZFS-independent)
|
||||||
|
- Import hdd-pool via `boot.zfs.extraPools`
|
||||||
|
- Verify all datasets mount correctly
|
||||||
|
|
||||||
|
4. **Service migration**:
|
||||||
|
- Configure NixOS services to use ZFS dataset paths
|
||||||
|
- Update NFS exports
|
||||||
|
- Test from consuming hosts
|
||||||
|
|
||||||
|
5. **Cutover**:
|
||||||
|
- Update DNS/client mounts if IP changes
|
||||||
|
- Verify monitoring integration
|
||||||
|
- Decommission TrueNAS
|
||||||
|
|
||||||
|
### Post-Expansion: Vdev Rebalancing
|
||||||
|
|
||||||
|
ZFS has no built-in rebalance command. After adding the new 24TB vdev, ZFS will
|
||||||
|
write new data preferentially to it (most free space), leaving old vdevs packed
|
||||||
|
at ~97%. This is suboptimal but not urgent once overall pool usage drops to ~50%.
|
||||||
|
|
||||||
|
To gradually rebalance, rewrite files in place so ZFS redistributes blocks across
|
||||||
|
all vdevs proportional to free space:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Rewrite files individually (spreads blocks across all vdevs)
|
||||||
|
find /pool/dataset -type f -exec sh -c '
|
||||||
|
for f; do cp "$f" "$f.rebal" && mv "$f.rebal" "$f"; done
|
||||||
|
' _ {} +
|
||||||
|
```
|
||||||
|
|
||||||
|
Avoid `zfs send/recv` for large datasets (e.g. 20TB) as this would concentrate
|
||||||
|
data on the emptiest vdev rather than spreading it evenly.
|
||||||
|
|
||||||
|
**Recommendation**: Do this after NixOS migration is stable. Not urgent - the pool
|
||||||
|
will function fine with uneven distribution, just slightly suboptimal for performance.
|
||||||
|
|
||||||
|
### Migration Advantages
|
||||||
|
|
||||||
|
- **No data migration**: ZFS pool imported directly, no copying terabytes of data
|
||||||
|
- **Low risk**: Pool expansion done on stable TrueNAS before OS swap
|
||||||
|
- **Reversible**: Can boot back to TrueNAS if NixOS has issues (ZFS pool is OS-independent)
|
||||||
|
- **Quick cutover**: Once NixOS config is ready, the OS swap is fast
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. ~~Decide on disk size~~ - 2x 24TB ordered
|
||||||
|
2. Install drives and add mirror vdev to ZFS pool
|
||||||
|
3. Check SMART data on 8TB drives - decide whether to keep or retire
|
||||||
|
4. Design NixOS host configuration (`hosts/nas1/`)
|
||||||
|
5. Document NFS export mapping (current -> new)
|
||||||
|
6. Plan NixOS installation and cutover
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- [ ] Hostname for new NAS host? (nas1? storage1?)
|
||||||
|
- [ ] IP address/subnet: NAS and Proxmox are both on 10GbE to the same switch but different subnets, forcing traffic through the router (bottleneck). Move to same subnet during migration.
|
||||||
|
- [x] Boot drive: Reuse TrueNAS boot-pool SSDs as mdadm RAID1 for NixOS root (no ZFS on boot path)
|
||||||
|
- [ ] Retire old 8TB drives? (SMART looks healthy, keep unless chassis space is needed)
|
||||||
|
- [x] Drive trays: ordered domestically (expected 2026-02-25 to 2026-03-03)
|
||||||
|
- [ ] Timeline/maintenance window for NixOS swap?
|
||||||
311
docs/user-management.md
Normal file
311
docs/user-management.md
Normal file
@@ -0,0 +1,311 @@
|
|||||||
|
# User Management with Kanidm
|
||||||
|
|
||||||
|
Central authentication for the homelab using Kanidm.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
- **Server**: kanidm01.home.2rjus.net (auth.home.2rjus.net)
|
||||||
|
- **WebUI**: https://auth.home.2rjus.net
|
||||||
|
- **LDAPS**: port 636
|
||||||
|
|
||||||
|
## CLI Setup
|
||||||
|
|
||||||
|
The `kanidm` CLI is available in the devshell:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix develop
|
||||||
|
|
||||||
|
# Login as idm_admin
|
||||||
|
kanidm login --name idm_admin --url https://auth.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## User Management
|
||||||
|
|
||||||
|
POSIX users are managed imperatively via the `kanidm` CLI. This allows setting
|
||||||
|
all attributes (including UNIX password) in one workflow.
|
||||||
|
|
||||||
|
### Creating a POSIX User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create the person
|
||||||
|
kanidm person create <username> "<Display Name>"
|
||||||
|
|
||||||
|
# Add to groups
|
||||||
|
kanidm group add-members ssh-users <username>
|
||||||
|
|
||||||
|
# Enable POSIX (UID is auto-assigned)
|
||||||
|
kanidm person posix set <username>
|
||||||
|
|
||||||
|
# Set UNIX password (required for SSH login, min 10 characters)
|
||||||
|
kanidm person posix set-password <username>
|
||||||
|
|
||||||
|
# Optionally set login shell
|
||||||
|
kanidm person posix set <username> --shell /bin/zsh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Setting Email Address
|
||||||
|
|
||||||
|
Email is required for OAuth2/OIDC login (e.g., Grafana):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person update <username> --mail <email>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Full User Creation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person create testuser "Test User"
|
||||||
|
kanidm person update testuser --mail testuser@home.2rjus.net
|
||||||
|
kanidm group add-members ssh-users testuser
|
||||||
|
kanidm group add-members users testuser # Required for OAuth2 scopes
|
||||||
|
kanidm person posix set testuser
|
||||||
|
kanidm person posix set-password testuser
|
||||||
|
kanidm person get testuser
|
||||||
|
```
|
||||||
|
|
||||||
|
After creation, verify on a client host:
|
||||||
|
```bash
|
||||||
|
getent passwd testuser
|
||||||
|
ssh testuser@testvm01.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
### Viewing User Details
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Removing a User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person delete <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Group Management
|
||||||
|
|
||||||
|
Groups for POSIX access are also managed via CLI.
|
||||||
|
|
||||||
|
### Creating a POSIX Group
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create the group
|
||||||
|
kanidm group create <group-name>
|
||||||
|
|
||||||
|
# Enable POSIX with a specific GID
|
||||||
|
kanidm group posix set <group-name> --gidnumber <gid>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adding Members
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group add-members <group-name> <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Viewing Group Details
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group get <group-name>
|
||||||
|
kanidm group list-members <group-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example: Full Group Creation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm group create testgroup
|
||||||
|
kanidm group posix set testgroup --gidnumber 68010
|
||||||
|
kanidm group add-members testgroup testuser
|
||||||
|
kanidm group get testgroup
|
||||||
|
```
|
||||||
|
|
||||||
|
After creation, verify on a client host:
|
||||||
|
```bash
|
||||||
|
getent group testgroup
|
||||||
|
```
|
||||||
|
|
||||||
|
### Current Groups
|
||||||
|
|
||||||
|
| Group | GID | Purpose |
|
||||||
|
|-------|-----|---------|
|
||||||
|
| ssh-users | 68000 | SSH login access |
|
||||||
|
| admins | 68001 | Administrative access |
|
||||||
|
| users | 68002 | General users |
|
||||||
|
|
||||||
|
### UID/GID Allocation
|
||||||
|
|
||||||
|
Kanidm auto-assigns UIDs/GIDs from its configured range. For manually assigned GIDs:
|
||||||
|
|
||||||
|
| Range | Purpose |
|
||||||
|
|-------|---------|
|
||||||
|
| 65,536+ | Users (auto-assigned) |
|
||||||
|
| 68,000 - 68,999 | Groups (manually assigned) |
|
||||||
|
|
||||||
|
## OAuth2/OIDC Login (Web Services)
|
||||||
|
|
||||||
|
For OAuth2/OIDC login to web services like Grafana, users need:
|
||||||
|
|
||||||
|
1. **Primary credential** - Password set via `credential update` (separate from unix password)
|
||||||
|
2. **MFA** - TOTP or passkey (Kanidm requires MFA for primary credentials)
|
||||||
|
3. **Group membership** - Member of `users` group (for OAuth2 scope mapping)
|
||||||
|
4. **Email address** - Set via `person update --mail`
|
||||||
|
|
||||||
|
### Setting Up Primary Credential (Web Login)
|
||||||
|
|
||||||
|
The primary credential is different from the unix/POSIX password:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Interactive credential setup
|
||||||
|
kanidm person credential update <username>
|
||||||
|
|
||||||
|
# In the interactive prompt:
|
||||||
|
# 1. Type 'password' to set a password
|
||||||
|
# 2. Type 'totp' to add TOTP (scan QR with authenticator app)
|
||||||
|
# 3. Type 'commit' to save
|
||||||
|
```
|
||||||
|
|
||||||
|
### Verifying OAuth2 Readiness
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
Check for:
|
||||||
|
- `mail:` - Email address set
|
||||||
|
- `memberof:` - Includes `users@home.2rjus.net`
|
||||||
|
- Primary credential status (check via `credential update` → `status`)
|
||||||
|
|
||||||
|
## PAM/NSS Client Configuration
|
||||||
|
|
||||||
|
Enable central authentication on a host:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.kanidm.enable = true;
|
||||||
|
```
|
||||||
|
|
||||||
|
This configures:
|
||||||
|
- `services.kanidm.enablePam = true`
|
||||||
|
- Client connection to auth.home.2rjus.net
|
||||||
|
- Login authorization for `ssh-users` group
|
||||||
|
- Short usernames (`torjus` instead of `torjus@home.2rjus.net`)
|
||||||
|
- Home directory symlinks (`/home/torjus` → UUID-based directory)
|
||||||
|
|
||||||
|
### Enabled Hosts
|
||||||
|
|
||||||
|
- testvm01, testvm02, testvm03 (test tier)
|
||||||
|
|
||||||
|
### Options
|
||||||
|
|
||||||
|
```nix
|
||||||
|
homelab.kanidm = {
|
||||||
|
enable = true;
|
||||||
|
server = "https://auth.home.2rjus.net"; # default
|
||||||
|
allowedLoginGroups = [ "ssh-users" ]; # default
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Home Directories
|
||||||
|
|
||||||
|
Home directories use UUID-based paths for stability (so renaming a user doesn't
|
||||||
|
require moving their home directory). Symlinks provide convenient access:
|
||||||
|
|
||||||
|
```
|
||||||
|
/home/torjus -> /home/e4f4c56c-4aee-4c20-846f-90cb69807733
|
||||||
|
```
|
||||||
|
|
||||||
|
The symlinks are created by `kanidm-unixd-tasks` on first login.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Verify NSS Resolution
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check user resolution
|
||||||
|
getent passwd <username>
|
||||||
|
|
||||||
|
# Check group resolution
|
||||||
|
getent group <group-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test SSH Login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh <username>@<hostname>.home.2rjus.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### "PAM user mismatch" error
|
||||||
|
|
||||||
|
SSH fails with "fatal: PAM user mismatch" in logs. This happens when Kanidm returns
|
||||||
|
usernames in SPN format (`torjus@home.2rjus.net`) but SSH expects short names (`torjus`).
|
||||||
|
|
||||||
|
**Solution**: Configure `uid_attr_map = "name"` in unixSettings (already set in our module).
|
||||||
|
|
||||||
|
Check current format:
|
||||||
|
```bash
|
||||||
|
getent passwd torjus
|
||||||
|
# Should show: torjus:x:65536:...
|
||||||
|
# NOT: torjus@home.2rjus.net:x:65536:...
|
||||||
|
```
|
||||||
|
|
||||||
|
### User resolves but SSH fails immediately
|
||||||
|
|
||||||
|
The user's login group (e.g., `ssh-users`) likely doesn't have POSIX enabled:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check if group has POSIX
|
||||||
|
getent group ssh-users
|
||||||
|
|
||||||
|
# If empty, enable POSIX on the server
|
||||||
|
kanidm group posix set ssh-users --gidnumber 68000
|
||||||
|
```
|
||||||
|
|
||||||
|
### User doesn't resolve via getent
|
||||||
|
|
||||||
|
1. Check kanidm-unixd service is running:
|
||||||
|
```bash
|
||||||
|
systemctl status kanidm-unixd
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check unixd can reach server:
|
||||||
|
```bash
|
||||||
|
kanidm-unix status
|
||||||
|
# Should show: system: online, Kanidm: online
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Check client can reach server:
|
||||||
|
```bash
|
||||||
|
curl -s https://auth.home.2rjus.net/status
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Check user has POSIX enabled on server:
|
||||||
|
```bash
|
||||||
|
kanidm person get <username>
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Restart nscd to clear stale cache:
|
||||||
|
```bash
|
||||||
|
systemctl restart nscd
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Invalidate kanidm cache:
|
||||||
|
```bash
|
||||||
|
kanidm-unix cache-invalidate
|
||||||
|
```
|
||||||
|
|
||||||
|
### Changes not taking effect after deployment
|
||||||
|
|
||||||
|
NixOS uses nsncd (a Rust reimplementation of nscd) for NSS caching. After deploying
|
||||||
|
kanidm-unixd config changes, you may need to restart both services:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
systemctl restart kanidm-unixd
|
||||||
|
systemctl restart nscd
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test PAM authentication directly
|
||||||
|
|
||||||
|
Use the kanidm-unix CLI to test PAM auth without SSH:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kanidm-unix auth-test --name <username>
|
||||||
|
```
|
||||||
560
docs/vault-bootstrap-implementation.md
Normal file
560
docs/vault-bootstrap-implementation.md
Normal file
@@ -0,0 +1,560 @@
|
|||||||
|
# Phase 4d: Vault Bootstrap Integration - Implementation Summary
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Phase 4d implements automatic Vault/OpenBao integration for new NixOS hosts, enabling:
|
||||||
|
- Zero-touch secret provisioning on first boot
|
||||||
|
- Automatic AppRole authentication
|
||||||
|
- Runtime secret fetching with caching
|
||||||
|
- Periodic secret rotation
|
||||||
|
|
||||||
|
**Key principle**: Existing sops-nix infrastructure remains unchanged. This is new infrastructure running in parallel.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Component Diagram
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ Developer Workstation │
|
||||||
|
│ │
|
||||||
|
│ create-host --hostname myhost --ip 10.69.13.x/24 │
|
||||||
|
│ │ │
|
||||||
|
│ ├─> Generate host configs (hosts/myhost/) │
|
||||||
|
│ ├─> Update flake.nix │
|
||||||
|
│ ├─> Update terraform/vms.tf │
|
||||||
|
│ ├─> Generate terraform/vault/hosts-generated.tf │
|
||||||
|
│ ├─> Apply Vault Terraform (create AppRole) │
|
||||||
|
│ └─> Generate wrapped token (24h TTL) ───┐ │
|
||||||
|
│ │ │
|
||||||
|
└───────────────────────────────────────────────┼────────────┘
|
||||||
|
│
|
||||||
|
┌───────────────────────────┘
|
||||||
|
│ Wrapped Token
|
||||||
|
│ (single-use, 24h expiry)
|
||||||
|
↓
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ Cloud-init (VM Provisioning) │
|
||||||
|
│ │
|
||||||
|
│ /etc/environment: │
|
||||||
|
│ VAULT_ADDR=https://vault01.home.2rjus.net:8200 │
|
||||||
|
│ VAULT_WRAPPED_TOKEN=hvs.CAES... │
|
||||||
|
│ VAULT_SKIP_VERIFY=1 │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
↓
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ Bootstrap Service (First Boot) │
|
||||||
|
│ │
|
||||||
|
│ 1. Read VAULT_WRAPPED_TOKEN from environment │
|
||||||
|
│ 2. POST /v1/sys/wrapping/unwrap │
|
||||||
|
│ 3. Extract role_id + secret_id │
|
||||||
|
│ 4. Store in /var/lib/vault/approle/ │
|
||||||
|
│ ├─ role-id (600 permissions) │
|
||||||
|
│ └─ secret-id (600 permissions) │
|
||||||
|
│ 5. Continue with nixos-rebuild boot │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
↓
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ Runtime (Service Starts) │
|
||||||
|
│ │
|
||||||
|
│ vault-secret-<name>.service (ExecStartPre) │
|
||||||
|
│ │ │
|
||||||
|
│ ├─> vault-fetch <secret-path> <output-dir> │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ├─> Read role_id + secret_id │
|
||||||
|
│ │ ├─> POST /v1/auth/approle/login → token │
|
||||||
|
│ │ ├─> GET /v1/secret/data/<path> → secrets │
|
||||||
|
│ │ ├─> Write /run/secrets/<name>/password │
|
||||||
|
│ │ ├─> Write /run/secrets/<name>/api_key │
|
||||||
|
│ │ └─> Cache to /var/lib/vault/cache/<name>/ │
|
||||||
|
│ │ │
|
||||||
|
│ └─> chown/chmod secret files │
|
||||||
|
│ │
|
||||||
|
│ myservice.service │
|
||||||
|
│ └─> Reads secrets from /run/secrets/<name>/ │
|
||||||
|
└─────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Flow
|
||||||
|
|
||||||
|
1. **Provisioning Time** (Developer → Vault):
|
||||||
|
- create-host generates AppRole configuration
|
||||||
|
- Terraform creates AppRole + policy in Vault
|
||||||
|
- Vault generates wrapped token containing role_id + secret_id
|
||||||
|
- Wrapped token stored in terraform/vms.tf
|
||||||
|
|
||||||
|
2. **Bootstrap Time** (Cloud-init → VM):
|
||||||
|
- Cloud-init injects wrapped token via /etc/environment
|
||||||
|
- Bootstrap service unwraps token (single-use operation)
|
||||||
|
- Stores unwrapped credentials persistently
|
||||||
|
|
||||||
|
3. **Runtime** (Service → Vault):
|
||||||
|
- Service starts
|
||||||
|
- ExecStartPre hook calls vault-fetch
|
||||||
|
- vault-fetch authenticates using stored credentials
|
||||||
|
- Fetches secrets and caches them
|
||||||
|
- Service reads secrets from filesystem
|
||||||
|
|
||||||
|
## Implementation Details
|
||||||
|
|
||||||
|
### 1. vault-fetch Helper (`scripts/vault-fetch/`)
|
||||||
|
|
||||||
|
**Purpose**: Fetch secrets from Vault and write to filesystem
|
||||||
|
|
||||||
|
**Features**:
|
||||||
|
- Reads AppRole credentials from `/var/lib/vault/approle/`
|
||||||
|
- Authenticates to Vault (fresh token each time)
|
||||||
|
- Fetches secret from KV v2 engine
|
||||||
|
- Writes individual files per secret key
|
||||||
|
- Updates cache for fallback
|
||||||
|
- Gracefully degrades to cache if Vault unreachable
|
||||||
|
|
||||||
|
**Usage**:
|
||||||
|
```bash
|
||||||
|
vault-fetch hosts/monitoring01/grafana /run/secrets/grafana
|
||||||
|
```
|
||||||
|
|
||||||
|
**Environment Variables**:
|
||||||
|
- `VAULT_ADDR`: Vault server (default: https://vault01.home.2rjus.net:8200)
|
||||||
|
- `VAULT_SKIP_VERIFY`: Skip TLS verification (default: 1)
|
||||||
|
|
||||||
|
**Error Handling**:
|
||||||
|
- Vault unreachable → Use cache (log warning)
|
||||||
|
- Invalid credentials → Fail with clear error
|
||||||
|
- No cache + unreachable → Fail with error
|
||||||
|
|
||||||
|
### 2. NixOS Module (`system/vault-secrets.nix`)
|
||||||
|
|
||||||
|
**Purpose**: Declarative Vault secret management for NixOS services
|
||||||
|
|
||||||
|
**Configuration Options**:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
vault.enable = true; # Enable Vault integration
|
||||||
|
|
||||||
|
vault.secrets.<name> = {
|
||||||
|
secretPath = "hosts/monitoring01/grafana"; # Path in Vault
|
||||||
|
outputDir = "/run/secrets/grafana"; # Where to write secrets
|
||||||
|
cacheDir = "/var/lib/vault/cache/grafana"; # Cache location
|
||||||
|
owner = "grafana"; # File owner
|
||||||
|
group = "grafana"; # File group
|
||||||
|
mode = "0400"; # Permissions
|
||||||
|
services = [ "grafana" ]; # Dependent services
|
||||||
|
restartTrigger = true; # Enable periodic rotation
|
||||||
|
restartInterval = "daily"; # Rotation schedule
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Module Behavior**:
|
||||||
|
|
||||||
|
1. **Fetch Service**: Creates `vault-secret-<name>.service`
|
||||||
|
- Runs on boot and before dependent services
|
||||||
|
- Calls vault-fetch to populate secrets
|
||||||
|
- Sets ownership and permissions
|
||||||
|
|
||||||
|
2. **Rotation Timer**: Optionally creates `vault-secret-rotate-<name>.timer`
|
||||||
|
- Scheduled restarts for secret rotation
|
||||||
|
- Automatically excluded for critical services
|
||||||
|
- Configurable interval (daily, weekly, monthly)
|
||||||
|
|
||||||
|
3. **Critical Service Protection**:
|
||||||
|
```nix
|
||||||
|
vault.criticalServices = [ "bind" "openbao" "step-ca" ];
|
||||||
|
```
|
||||||
|
Services in this list never get auto-restart timers
|
||||||
|
|
||||||
|
### 3. create-host Tool Updates
|
||||||
|
|
||||||
|
**New Functionality**:
|
||||||
|
|
||||||
|
1. **Vault Terraform Generation** (`generators.py`):
|
||||||
|
- Creates/updates `terraform/vault/hosts-generated.tf`
|
||||||
|
- Adds host policy granting access to `secret/data/hosts/<hostname>/*`
|
||||||
|
- Adds AppRole configuration
|
||||||
|
- Idempotent (safe to re-run)
|
||||||
|
|
||||||
|
2. **Wrapped Token Generation** (`vault_helper.py`):
|
||||||
|
- Applies Vault Terraform to create AppRole
|
||||||
|
- Reads role_id from Vault
|
||||||
|
- Generates secret_id
|
||||||
|
- Wraps credentials in cubbyhole token (24h TTL, single-use)
|
||||||
|
- Returns wrapped token
|
||||||
|
|
||||||
|
3. **VM Configuration Update** (`manipulators.py`):
|
||||||
|
- Adds `vault_wrapped_token` field to VM in vms.tf
|
||||||
|
- Preserves other VM settings
|
||||||
|
|
||||||
|
**New CLI Options**:
|
||||||
|
```bash
|
||||||
|
create-host --hostname myhost --ip 10.69.13.x/24
|
||||||
|
# Full workflow with Vault integration
|
||||||
|
|
||||||
|
create-host --hostname myhost --skip-vault
|
||||||
|
# Create host without Vault (legacy behavior)
|
||||||
|
|
||||||
|
create-host --hostname myhost --force
|
||||||
|
# Regenerate everything including new wrapped token
|
||||||
|
```
|
||||||
|
|
||||||
|
**Dependencies Added**:
|
||||||
|
- `hvac`: Python Vault client library
|
||||||
|
|
||||||
|
### 4. Bootstrap Service Updates
|
||||||
|
|
||||||
|
**New Behavior** (`hosts/template2/bootstrap.nix`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check for wrapped token
|
||||||
|
if [ -n "$VAULT_WRAPPED_TOKEN" ]; then
|
||||||
|
# Unwrap to get credentials
|
||||||
|
curl -X POST \
|
||||||
|
-H "X-Vault-Token: $VAULT_WRAPPED_TOKEN" \
|
||||||
|
$VAULT_ADDR/v1/sys/wrapping/unwrap
|
||||||
|
|
||||||
|
# Store role_id and secret_id
|
||||||
|
mkdir -p /var/lib/vault/approle
|
||||||
|
echo "$ROLE_ID" > /var/lib/vault/approle/role-id
|
||||||
|
echo "$SECRET_ID" > /var/lib/vault/approle/secret-id
|
||||||
|
chmod 600 /var/lib/vault/approle/*
|
||||||
|
|
||||||
|
# Continue with bootstrap...
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
**Error Handling**:
|
||||||
|
- Token already used → Log error, continue bootstrap
|
||||||
|
- Token expired → Log error, continue bootstrap
|
||||||
|
- Vault unreachable → Log warning, continue bootstrap
|
||||||
|
- **Never fails bootstrap** - host can still run without Vault
|
||||||
|
|
||||||
|
### 5. Cloud-init Configuration
|
||||||
|
|
||||||
|
**Updates** (`terraform/cloud-init.tf`):
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
write_files:
|
||||||
|
- path: /etc/environment
|
||||||
|
content: |
|
||||||
|
VAULT_ADDR=https://vault01.home.2rjus.net:8200
|
||||||
|
VAULT_WRAPPED_TOKEN=${vault_wrapped_token}
|
||||||
|
VAULT_SKIP_VERIFY=1
|
||||||
|
```
|
||||||
|
|
||||||
|
**VM Configuration** (`terraform/vms.tf`):
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
locals {
|
||||||
|
vms = {
|
||||||
|
"myhost" = {
|
||||||
|
ip = "10.69.13.x/24"
|
||||||
|
vault_wrapped_token = "hvs.CAESIBw..." # Added by create-host
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Vault Terraform Structure
|
||||||
|
|
||||||
|
**Generated Hosts File** (`terraform/vault/hosts-generated.tf`):
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
locals {
|
||||||
|
generated_host_policies = {
|
||||||
|
"myhost" = {
|
||||||
|
paths = [
|
||||||
|
"secret/data/hosts/myhost/*",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "vault_policy" "generated_host_policies" {
|
||||||
|
for_each = local.generated_host_policies
|
||||||
|
name = "host-${each.key}"
|
||||||
|
policy = <<-EOT
|
||||||
|
path "secret/data/hosts/${each.key}/*" {
|
||||||
|
capabilities = ["read", "list"]
|
||||||
|
}
|
||||||
|
EOT
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "vault_approle_auth_backend_role" "generated_hosts" {
|
||||||
|
for_each = local.generated_host_policies
|
||||||
|
|
||||||
|
backend = vault_auth_backend.approle.path
|
||||||
|
role_name = each.key
|
||||||
|
token_policies = ["host-${each.key}"]
|
||||||
|
secret_id_ttl = 0 # Never expire
|
||||||
|
token_ttl = 3600 # 1 hour tokens
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Separation of Concerns**:
|
||||||
|
- `approle.tf`: Manual host configurations (ha1, monitoring01)
|
||||||
|
- `hosts-generated.tf`: Auto-generated configurations
|
||||||
|
- `secrets.tf`: Secret definitions (manual)
|
||||||
|
- `pki.tf`: PKI infrastructure
|
||||||
|
|
||||||
|
## Security Model
|
||||||
|
|
||||||
|
### Credential Distribution
|
||||||
|
|
||||||
|
**Wrapped Token Security**:
|
||||||
|
- **Single-use**: Can only be unwrapped once
|
||||||
|
- **Time-limited**: 24h TTL
|
||||||
|
- **Low risk in git**: even if leaked, it is single-use and expires within 24h — the exposure window lasts only until the host unwraps it at bootstrap
|
||||||
|
- **Standard Vault pattern**: Built-in Vault feature
|
||||||
|
|
||||||
|
**Why wrapped tokens are secure**:
|
||||||
|
```
|
||||||
|
Developer commits wrapped token to git
|
||||||
|
↓
|
||||||
|
Attacker finds token in git history
|
||||||
|
↓
|
||||||
|
Attacker tries to use token
|
||||||
|
↓
|
||||||
|
❌ Token already used (unwrapped during bootstrap)
|
||||||
|
↓
|
||||||
|
❌ OR: Token expired (>24h old)
|
||||||
|
```
|
||||||
|
|
||||||
|
### AppRole Credentials
|
||||||
|
|
||||||
|
**Storage**:
|
||||||
|
- Location: `/var/lib/vault/approle/`
|
||||||
|
- Permissions: mode `0600`, owned by `root:root`
|
||||||
|
- Persistence: Survives reboots
|
||||||
|
|
||||||
|
**Security Properties**:
|
||||||
|
- `role_id`: Non-sensitive (like username)
|
||||||
|
- `secret_id`: Sensitive (like password)
|
||||||
|
- `secret_id_ttl = 0`: Never expires (simplicity vs rotation tradeoff)
|
||||||
|
- Tokens: Ephemeral (1h TTL, not cached)
|
||||||
|
|
||||||
|
**Attack Scenarios**:
|
||||||
|
|
||||||
|
1. **Attacker gets root on host**:
|
||||||
|
- Can read AppRole credentials
|
||||||
|
- Can only access that host's secrets
|
||||||
|
- Cannot access other hosts' secrets (policy restriction)
|
||||||
|
- ✅ Blast radius limited to single host
|
||||||
|
|
||||||
|
2. **Attacker intercepts wrapped token**:
|
||||||
|
- Single-use: Already consumed during bootstrap
|
||||||
|
- Time-limited: Likely expired
|
||||||
|
- ✅ Cannot be reused
|
||||||
|
|
||||||
|
3. **Vault server compromised**:
|
||||||
|
- All secrets exposed (same as any secret storage)
|
||||||
|
- ✅ No different from sops-nix master key compromise
|
||||||
|
|
||||||
|
### Secret Storage
|
||||||
|
|
||||||
|
**Runtime Secrets**:
|
||||||
|
- Location: `/run/secrets/` (tmpfs)
|
||||||
|
- Lost on reboot
|
||||||
|
- Re-fetched on service start
|
||||||
|
- ✅ Not in Nix store
|
||||||
|
- ✅ Not persisted to disk
|
||||||
|
|
||||||
|
**Cached Secrets**:
|
||||||
|
- Location: `/var/lib/vault/cache/`
|
||||||
|
- Persists across reboots
|
||||||
|
- Only used when Vault unreachable
|
||||||
|
- ✅ Enables service availability
|
||||||
|
- ⚠️ May be stale
|
||||||
|
|
||||||
|
## Failure Modes
|
||||||
|
|
||||||
|
### Wrapped Token Expired
|
||||||
|
|
||||||
|
**Symptom**: Bootstrap logs "token expired" error
|
||||||
|
|
||||||
|
**Impact**: Host boots but has no Vault credentials
|
||||||
|
|
||||||
|
**Fix**: Regenerate token and redeploy
|
||||||
|
```bash
|
||||||
|
create-host --hostname myhost --force
|
||||||
|
cd terraform && tofu apply
|
||||||
|
```
|
||||||
|
|
||||||
|
### Vault Unreachable
|
||||||
|
|
||||||
|
**Symptom**: Service logs "WARNING: Using cached secrets"
|
||||||
|
|
||||||
|
**Impact**: Service uses stale secrets (may work or fail depending on rotation)
|
||||||
|
|
||||||
|
**Fix**: Restore Vault connectivity, restart service
|
||||||
|
|
||||||
|
### No Cache Available
|
||||||
|
|
||||||
|
**Symptom**: Service fails to start with "No cache available"
|
||||||
|
|
||||||
|
**Impact**: Service unavailable until Vault restored
|
||||||
|
|
||||||
|
**Fix**: Restore Vault, restart service
|
||||||
|
|
||||||
|
### Invalid Credentials
|
||||||
|
|
||||||
|
**Symptom**: vault-fetch logs authentication failure
|
||||||
|
|
||||||
|
**Impact**: Service cannot start
|
||||||
|
|
||||||
|
**Fix**:
|
||||||
|
1. Check AppRole exists: `vault read auth/approle/role/hostname`
|
||||||
|
2. Check policy exists: `vault policy read host-hostname`
|
||||||
|
3. Regenerate credentials if needed
|
||||||
|
|
||||||
|
## Migration Path
|
||||||
|
|
||||||
|
### Current State (Phase 4d)
|
||||||
|
|
||||||
|
- ✅ sops-nix: Used by all existing services
|
||||||
|
- ✅ Vault: Available for new services
|
||||||
|
- ✅ Parallel operation: Both work simultaneously
|
||||||
|
|
||||||
|
### Future Migration
|
||||||
|
|
||||||
|
**Gradual Service Migration**:
|
||||||
|
|
||||||
|
1. **Pick a non-critical service** (e.g., test service)
|
||||||
|
2. **Add Vault secrets**:
|
||||||
|
```nix
|
||||||
|
vault.secrets.myservice = {
|
||||||
|
secretPath = "hosts/myhost/myservice";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
3. **Update service to read from Vault**:
|
||||||
|
```nix
|
||||||
|
systemd.services.myservice.serviceConfig = {
|
||||||
|
EnvironmentFile = "/run/secrets/myservice/password";
|
||||||
|
};
|
||||||
|
```
|
||||||
|
4. **Remove sops-nix secret**
|
||||||
|
5. **Test thoroughly**
|
||||||
|
6. **Repeat for next service**
|
||||||
|
|
||||||
|
**Critical Services Last**:
|
||||||
|
- DNS (bind)
|
||||||
|
- Certificate Authority (step-ca)
|
||||||
|
- Vault itself (openbao)
|
||||||
|
|
||||||
|
**Eventually**:
|
||||||
|
- All services migrated to Vault
|
||||||
|
- Remove sops-nix dependency
|
||||||
|
- Clean up `/secrets/` directory
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
### Bootstrap Time
|
||||||
|
|
||||||
|
**Added overhead**: ~2-5 seconds
|
||||||
|
- Token unwrap: ~1s
|
||||||
|
- Credential storage: ~1s
|
||||||
|
|
||||||
|
**Total bootstrap time**: Still <2 minutes (acceptable)
|
||||||
|
|
||||||
|
### Service Startup
|
||||||
|
|
||||||
|
**Added overhead**: ~1-3 seconds per service
|
||||||
|
- Vault authentication: ~1s
|
||||||
|
- Secret fetch: ~1s
|
||||||
|
- File operations: <1s
|
||||||
|
|
||||||
|
**Parallel vs Serial**:
|
||||||
|
- Multiple services fetch in parallel
|
||||||
|
- No cascade delays
|
||||||
|
|
||||||
|
### Cache Benefits
|
||||||
|
|
||||||
|
**When Vault unreachable**:
|
||||||
|
- Service starts in <1s (cache read)
|
||||||
|
- No Vault dependency for startup
|
||||||
|
- High availability maintained
|
||||||
|
|
||||||
|
## Testing Checklist
|
||||||
|
|
||||||
|
Complete testing workflow documented in `vault-bootstrap-testing.md`:
|
||||||
|
|
||||||
|
- [ ] Create test host with create-host
|
||||||
|
- [ ] Add test secrets to Vault
|
||||||
|
- [ ] Deploy VM and verify bootstrap
|
||||||
|
- [ ] Verify secrets fetched successfully
|
||||||
|
- [ ] Test service restart (re-fetch)
|
||||||
|
- [ ] Test Vault unreachable (cache fallback)
|
||||||
|
- [ ] Test secret rotation
|
||||||
|
- [ ] Test wrapped token expiry
|
||||||
|
- [ ] Test token reuse prevention
|
||||||
|
- [ ] Verify critical services excluded from auto-restart
|
||||||
|
|
||||||
|
## Files Changed
|
||||||
|
|
||||||
|
### Created
|
||||||
|
- `scripts/vault-fetch/vault-fetch.sh` - Secret fetching script
|
||||||
|
- `scripts/vault-fetch/default.nix` - Nix package
|
||||||
|
- `scripts/vault-fetch/README.md` - Documentation
|
||||||
|
- `system/vault-secrets.nix` - NixOS module
|
||||||
|
- `scripts/create-host/vault_helper.py` - Vault API client
|
||||||
|
- `terraform/vault/hosts-generated.tf` - Generated Terraform
|
||||||
|
- `docs/vault-bootstrap-implementation.md` - This file
|
||||||
|
- `docs/vault-bootstrap-testing.md` - Testing guide
|
||||||
|
|
||||||
|
### Modified
|
||||||
|
- `scripts/create-host/default.nix` - Add hvac dependency
|
||||||
|
- `scripts/create-host/create_host.py` - Add Vault integration
|
||||||
|
- `scripts/create-host/generators.py` - Add Vault Terraform generation
|
||||||
|
- `scripts/create-host/manipulators.py` - Add wrapped token injection
|
||||||
|
- `terraform/cloud-init.tf` - Inject Vault credentials
|
||||||
|
- `terraform/vms.tf` - Support vault_wrapped_token field
|
||||||
|
- `hosts/template2/bootstrap.nix` - Unwrap token and store credentials
|
||||||
|
- `system/default.nix` - Import vault-secrets module
|
||||||
|
- `flake.nix` - Add vault-fetch package
|
||||||
|
|
||||||
|
### Unchanged
|
||||||
|
- All existing sops-nix configuration
|
||||||
|
- All existing service configurations
|
||||||
|
- All existing host configurations
|
||||||
|
- `/secrets/` directory
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
### Phase 4e+ (Not in Scope)
|
||||||
|
|
||||||
|
1. **Dynamic Secrets**
|
||||||
|
- Database credentials with rotation
|
||||||
|
- Cloud provider credentials
|
||||||
|
- SSH certificates
|
||||||
|
|
||||||
|
2. **Secret Watcher**
|
||||||
|
- Monitor Vault for secret changes
|
||||||
|
- Automatically restart services on rotation
|
||||||
|
- Faster than periodic timers
|
||||||
|
|
||||||
|
3. **PKI Integration** (Phase 4c)
|
||||||
|
- Migrate from step-ca to Vault PKI
|
||||||
|
- Automatic certificate issuance
|
||||||
|
- Short-lived certificates
|
||||||
|
|
||||||
|
4. **Audit Logging**
|
||||||
|
- Track secret access
|
||||||
|
- Alert on suspicious patterns
|
||||||
|
- Compliance reporting
|
||||||
|
|
||||||
|
5. **Multi-Environment**
|
||||||
|
- Dev/staging/prod separation
|
||||||
|
- Per-environment Vault namespaces
|
||||||
|
- Separate AppRoles per environment
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Phase 4d successfully implements automatic Vault integration for new NixOS hosts with:
|
||||||
|
|
||||||
|
- ✅ Zero-touch provisioning
|
||||||
|
- ✅ Secure credential distribution
|
||||||
|
- ✅ Graceful degradation
|
||||||
|
- ✅ Backward compatibility
|
||||||
|
- ✅ Production-ready error handling
|
||||||
|
|
||||||
|
The infrastructure is ready for gradual migration of existing services from sops-nix to Vault.
|
||||||
419
docs/vault-bootstrap-testing.md
Normal file
419
docs/vault-bootstrap-testing.md
Normal file
@@ -0,0 +1,419 @@
|
|||||||
|
# Phase 4d: Vault Bootstrap Integration - Testing Guide
|
||||||
|
|
||||||
|
This guide walks through testing the complete Vault bootstrap workflow implemented in Phase 4d.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Before testing, ensure:
|
||||||
|
|
||||||
|
1. **Vault server is running**: vault01 (vault01.home.2rjus.net:8200) is accessible
|
||||||
|
2. **Vault access**: You have a Vault token with admin permissions (set `BAO_TOKEN` env var)
|
||||||
|
3. **Terraform installed**: OpenTofu is available in your PATH
|
||||||
|
4. **Git repository clean**: All Phase 4d changes are committed to a branch
|
||||||
|
|
||||||
|
## Test Scenario: Create vaulttest01
|
||||||
|
|
||||||
|
### Step 1: Create Test Host Configuration
|
||||||
|
|
||||||
|
Run the create-host tool with Vault integration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Ensure you have Vault token
|
||||||
|
export BAO_TOKEN="your-vault-admin-token"
|
||||||
|
|
||||||
|
# Create test host
|
||||||
|
nix run .#create-host -- \
|
||||||
|
--hostname vaulttest01 \
|
||||||
|
--ip 10.69.13.150/24 \
|
||||||
|
--cpu 2 \
|
||||||
|
--memory 2048 \
|
||||||
|
--disk 20G
|
||||||
|
|
||||||
|
# If you need to regenerate (e.g., wrapped token expired):
|
||||||
|
nix run .#create-host -- \
|
||||||
|
--hostname vaulttest01 \
|
||||||
|
--ip 10.69.13.150/24 \
|
||||||
|
--force
|
||||||
|
```
|
||||||
|
|
||||||
|
**What this does:**
|
||||||
|
- Creates `hosts/vaulttest01/` configuration
|
||||||
|
- Updates `flake.nix` with new host
|
||||||
|
- Updates `terraform/vms.tf` with VM definition
|
||||||
|
- Generates `terraform/vault/hosts-generated.tf` with AppRole and policy
|
||||||
|
- Creates a wrapped token (24h TTL, single-use)
|
||||||
|
- Adds wrapped token to VM configuration
|
||||||
|
|
||||||
|
**Expected output:**
|
||||||
|
```
|
||||||
|
✓ All validations passed
|
||||||
|
✓ Created hosts/vaulttest01/default.nix
|
||||||
|
✓ Created hosts/vaulttest01/configuration.nix
|
||||||
|
✓ Updated flake.nix
|
||||||
|
✓ Updated terraform/vms.tf
|
||||||
|
|
||||||
|
Configuring Vault integration...
|
||||||
|
✓ Updated terraform/vault/hosts-generated.tf
|
||||||
|
Applying Vault Terraform configuration...
|
||||||
|
✓ Terraform applied successfully
|
||||||
|
Reading AppRole credentials for vaulttest01...
|
||||||
|
✓ Retrieved role_id
|
||||||
|
✓ Generated secret_id
|
||||||
|
Creating wrapped token (24h TTL, single-use)...
|
||||||
|
✓ Created wrapped token: hvs.CAESIBw...
|
||||||
|
⚠️ Token expires in 24 hours
|
||||||
|
⚠️ Token can only be used once
|
||||||
|
✓ Added wrapped token to terraform/vms.tf
|
||||||
|
|
||||||
|
✓ Host configuration generated successfully!
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Add Test Service Configuration
|
||||||
|
|
||||||
|
Edit `hosts/vaulttest01/configuration.nix` to enable Vault and add a test service:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
{ config, pkgs, lib, ... }:
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
];
|
||||||
|
|
||||||
|
# Enable Vault secrets management
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Define a test secret
|
||||||
|
vault.secrets.test-service = {
|
||||||
|
secretPath = "hosts/vaulttest01/test-service";
|
||||||
|
restartTrigger = true;
|
||||||
|
restartInterval = "daily";
|
||||||
|
services = [ "vault-test" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Create a test service that uses the secret
|
||||||
|
systemd.services.vault-test = {
|
||||||
|
description = "Test Vault secret fetching";
|
||||||
|
wantedBy = [ "multi-user.target" ];
|
||||||
|
after = [ "vault-secret-test-service.service" ];
|
||||||
|
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
RemainAfterExit = true;
|
||||||
|
|
||||||
|
ExecStart = pkgs.writeShellScript "vault-test" ''
|
||||||
|
echo "=== Vault Secret Test ==="
|
||||||
|
echo "Secret path: hosts/vaulttest01/test-service"
|
||||||
|
|
||||||
|
if [ -f /run/secrets/test-service/password ]; then
|
||||||
|
echo "✓ Password file exists"
|
||||||
|
echo "Password length: $(wc -c < /run/secrets/test-service/password)"
|
||||||
|
else
|
||||||
|
echo "✗ Password file missing!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -d /var/lib/vault/cache/test-service ]; then
|
||||||
|
echo "✓ Cache directory exists"
|
||||||
|
else
|
||||||
|
echo "✗ Cache directory missing!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Test successful!"
|
||||||
|
'';
|
||||||
|
|
||||||
|
StandardOutput = "journal+console";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Rest of configuration...
|
||||||
|
networking.hostName = "vaulttest01";
|
||||||
|
networking.domain = "home.2rjus.net";
|
||||||
|
|
||||||
|
systemd.network.networks."10-lan" = {
|
||||||
|
matchConfig.Name = "ens18";
|
||||||
|
address = [ "10.69.13.150/24" ];
|
||||||
|
gateway = [ "10.69.13.1" ];
|
||||||
|
dns = [ "10.69.13.5" "10.69.13.6" ];
|
||||||
|
domains = [ "home.2rjus.net" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
system.stateVersion = "25.11";
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Create Test Secrets in Vault
|
||||||
|
|
||||||
|
Add test secrets to Vault using Terraform:
|
||||||
|
|
||||||
|
Edit `terraform/vault/secrets.tf`:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
locals {
|
||||||
|
secrets = {
|
||||||
|
# ... existing secrets ...
|
||||||
|
|
||||||
|
# Test secret for vaulttest01
|
||||||
|
"hosts/vaulttest01/test-service" = {
|
||||||
|
auto_generate = true
|
||||||
|
password_length = 24
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Apply the Vault configuration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd terraform/vault
|
||||||
|
tofu apply
|
||||||
|
```
|
||||||
|
|
||||||
|
**Verify the secret exists:**
|
||||||
|
```bash
|
||||||
|
export VAULT_ADDR=https://vault01.home.2rjus.net:8200
|
||||||
|
export VAULT_SKIP_VERIFY=1
|
||||||
|
|
||||||
|
vault kv get secret/hosts/vaulttest01/test-service
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Deploy the VM
|
||||||
|
|
||||||
|
**Important**: Deploy within 24 hours of creating the host (wrapped token TTL)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd terraform
|
||||||
|
tofu plan # Review changes
|
||||||
|
tofu apply # Deploy VM
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 5: Monitor Bootstrap Process
|
||||||
|
|
||||||
|
SSH into the VM and monitor the bootstrap:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Watch bootstrap logs
|
||||||
|
ssh root@vaulttest01
|
||||||
|
journalctl -fu nixos-bootstrap.service
|
||||||
|
|
||||||
|
# Expected log output:
|
||||||
|
# Starting NixOS bootstrap for host: vaulttest01
|
||||||
|
# Network connectivity confirmed
|
||||||
|
# Unwrapping Vault token to get AppRole credentials...
|
||||||
|
# Vault credentials unwrapped and stored successfully
|
||||||
|
# Fetching and building NixOS configuration from flake...
|
||||||
|
# Successfully built configuration for vaulttest01
|
||||||
|
# Rebooting into new configuration...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 6: Verify Vault Integration
|
||||||
|
|
||||||
|
After the VM reboots, verify the integration:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh root@vaulttest01
|
||||||
|
|
||||||
|
# Check AppRole credentials were stored
|
||||||
|
ls -la /var/lib/vault/approle/
|
||||||
|
# Expected: role-id and secret-id files with 600 permissions
|
||||||
|
|
||||||
|
cat /var/lib/vault/approle/role-id
|
||||||
|
# Should show a UUID
|
||||||
|
|
||||||
|
# Check vault-secret service ran successfully
|
||||||
|
systemctl status vault-secret-test-service.service
|
||||||
|
# Should be active (exited)
|
||||||
|
|
||||||
|
journalctl -u vault-secret-test-service.service
|
||||||
|
# Should show successful secret fetch:
|
||||||
|
# [vault-fetch] Authenticating to Vault at https://vault01.home.2rjus.net:8200
|
||||||
|
# [vault-fetch] Successfully authenticated to Vault
|
||||||
|
# [vault-fetch] Fetching secret from path: hosts/vaulttest01/test-service
|
||||||
|
# [vault-fetch] Writing secrets to /run/secrets/test-service
|
||||||
|
# [vault-fetch] - Wrote secret key: password
|
||||||
|
# [vault-fetch] Successfully fetched and cached secrets
|
||||||
|
|
||||||
|
# Check test service passed
|
||||||
|
systemctl status vault-test.service
|
||||||
|
journalctl -u vault-test.service
|
||||||
|
# Should show:
|
||||||
|
# === Vault Secret Test ===
|
||||||
|
# ✓ Password file exists
|
||||||
|
# ✓ Cache directory exists
|
||||||
|
# Test successful!
|
||||||
|
|
||||||
|
# Verify secret files exist
|
||||||
|
ls -la /run/secrets/test-service/
|
||||||
|
# Should show password file with 400 permissions
|
||||||
|
|
||||||
|
# Verify cache exists
|
||||||
|
ls -la /var/lib/vault/cache/test-service/
|
||||||
|
# Should show cached password file
|
||||||
|
```
|
||||||
|
|
||||||
|
## Test Scenarios
|
||||||
|
|
||||||
|
### Scenario 1: Fresh Deployment
|
||||||
|
✅ **Expected**: All secrets fetched successfully from Vault
|
||||||
|
|
||||||
|
### Scenario 2: Service Restart
|
||||||
|
```bash
|
||||||
|
systemctl restart vault-test.service
|
||||||
|
```
|
||||||
|
✅ **Expected**: Secrets re-fetched from Vault, service starts successfully
|
||||||
|
|
||||||
|
### Scenario 3: Vault Unreachable
|
||||||
|
```bash
|
||||||
|
# On vault01, stop Vault temporarily
|
||||||
|
ssh root@vault01
|
||||||
|
systemctl stop openbao
|
||||||
|
|
||||||
|
# On vaulttest01, restart test service
|
||||||
|
ssh root@vaulttest01
|
||||||
|
systemctl restart vault-test.service
|
||||||
|
journalctl -u vault-secret-test-service.service | tail -20
|
||||||
|
```
|
||||||
|
✅ **Expected**:
|
||||||
|
- Warning logged: "Using cached secrets from /var/lib/vault/cache/test-service"
|
||||||
|
- Service starts successfully using cached secrets
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restore Vault
|
||||||
|
ssh root@vault01
|
||||||
|
systemctl start openbao
|
||||||
|
```
|
||||||
|
|
||||||
|
### Scenario 4: Secret Rotation
|
||||||
|
```bash
|
||||||
|
# Update secret in Vault
|
||||||
|
vault kv put secret/hosts/vaulttest01/test-service password="new-secret-value"
|
||||||
|
|
||||||
|
# On vaulttest01, trigger rotation
|
||||||
|
ssh root@vaulttest01
|
||||||
|
systemctl restart vault-secret-test-service.service
|
||||||
|
|
||||||
|
# Verify new secret
|
||||||
|
cat /run/secrets/test-service/password
|
||||||
|
# Should show new value
|
||||||
|
```
|
||||||
|
✅ **Expected**: New secret fetched and cached
|
||||||
|
|
||||||
|
### Scenario 5: Expired Wrapped Token
|
||||||
|
```bash
|
||||||
|
# Wait 24+ hours after create-host, then try to deploy
|
||||||
|
cd terraform
|
||||||
|
tofu apply
|
||||||
|
```
|
||||||
|
❌ **Expected**: Bootstrap logs an "expired token" error and continues without Vault credentials (bootstrap itself never fails — see the implementation doc's error-handling guarantees); the host boots but cannot fetch secrets
|
||||||
|
|
||||||
|
**Fix (Option 1 - Regenerate token only):**
|
||||||
|
```bash
|
||||||
|
# Only regenerates the wrapped token, preserves all other configuration
|
||||||
|
nix run .#create-host -- --hostname vaulttest01 --regenerate-token
|
||||||
|
cd terraform
|
||||||
|
tofu apply
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix (Option 2 - Full regeneration with --force):**
|
||||||
|
```bash
|
||||||
|
# Overwrites entire host configuration (including any manual changes)
|
||||||
|
nix run .#create-host -- --hostname vaulttest01 --force
|
||||||
|
cd terraform
|
||||||
|
tofu apply
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recommendation**: Use `--regenerate-token` to avoid losing manual configuration changes.
|
||||||
|
|
||||||
|
### Scenario 6: Already-Used Wrapped Token
|
||||||
|
Try to deploy the same VM twice without regenerating token.
|
||||||
|
|
||||||
|
❌ **Expected**: Second bootstrap fails with "token already used" message
|
||||||
|
|
||||||
|
## Cleanup
|
||||||
|
|
||||||
|
After testing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Destroy test VM
|
||||||
|
cd terraform
|
||||||
|
tofu destroy -target=proxmox_vm_qemu.vm[\"vaulttest01\"]
|
||||||
|
|
||||||
|
# Remove test secrets from Vault
|
||||||
|
vault kv delete secret/hosts/vaulttest01/test-service
|
||||||
|
|
||||||
|
# Remove host configuration (optional)
|
||||||
|
git rm -r hosts/vaulttest01
|
||||||
|
# Edit flake.nix to remove nixosConfigurations.vaulttest01
|
||||||
|
# Edit terraform/vms.tf to remove vaulttest01
|
||||||
|
# Edit terraform/vault/hosts-generated.tf to remove vaulttest01
|
||||||
|
```
|
||||||
|
|
||||||
|
## Success Criteria Checklist
|
||||||
|
|
||||||
|
Phase 4d is considered successful when:
|
||||||
|
|
||||||
|
- [x] create-host generates Vault configuration automatically
|
||||||
|
- [x] New hosts receive AppRole credentials via cloud-init
|
||||||
|
- [x] Bootstrap stores credentials in /var/lib/vault/approle/
|
||||||
|
- [x] Services can fetch secrets using vault.secrets option
|
||||||
|
- [x] Secrets extracted to individual files in /run/secrets/
|
||||||
|
- [x] Cached secrets work when Vault is unreachable
|
||||||
|
- [x] Periodic restart timers work for secret rotation
|
||||||
|
- [x] Critical services excluded from auto-restart
|
||||||
|
- [x] Test host deploys and verifies working
|
||||||
|
- [x] sops-nix continues to work for existing services
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Bootstrap fails with "Failed to unwrap Vault token"
|
||||||
|
|
||||||
|
**Possible causes:**
|
||||||
|
- Token already used (wrapped tokens are single-use)
|
||||||
|
- Token expired (24h TTL)
|
||||||
|
- Invalid token
|
||||||
|
- Vault unreachable
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
```bash
|
||||||
|
# Regenerate token
|
||||||
|
nix run .#create-host -- --hostname vaulttest01 --force
|
||||||
|
cd terraform && tofu apply
|
||||||
|
```
|
||||||
|
|
||||||
|
### Secret fetch fails with authentication error
|
||||||
|
|
||||||
|
**Check:**
|
||||||
|
```bash
|
||||||
|
# Verify AppRole exists
|
||||||
|
vault read auth/approle/role/vaulttest01
|
||||||
|
|
||||||
|
# Verify policy exists
|
||||||
|
vault policy read host-vaulttest01
|
||||||
|
|
||||||
|
# Test authentication manually
|
||||||
|
ROLE_ID=$(cat /var/lib/vault/approle/role-id)
|
||||||
|
SECRET_ID=$(cat /var/lib/vault/approle/secret-id)
|
||||||
|
vault write auth/approle/login role_id="$ROLE_ID" secret_id="$SECRET_ID"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cache not working
|
||||||
|
|
||||||
|
**Check:**
|
||||||
|
```bash
|
||||||
|
# Verify cache directory exists and has files
|
||||||
|
ls -la /var/lib/vault/cache/test-service/
|
||||||
|
|
||||||
|
# Check permissions
|
||||||
|
stat /var/lib/vault/cache/test-service/password
|
||||||
|
# Should be 600 (rw-------)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
After successful testing:
|
||||||
|
|
||||||
|
1. Gradually migrate existing services from sops-nix to Vault
|
||||||
|
2. Consider implementing secret watcher for faster rotation (future enhancement)
|
||||||
|
3. Phase 4c: Migrate from step-ca to OpenBao PKI
|
||||||
|
4. Eventually deprecate and remove sops-nix
|
||||||
178
docs/vault/auto-unseal.md
Normal file
178
docs/vault/auto-unseal.md
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
# OpenBao TPM2 Auto-Unseal Setup
|
||||||
|
|
||||||
|
This document describes the one-time setup process for enabling TPM2-based auto-unsealing on vault01.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The auto-unseal feature uses systemd's `LoadCredentialEncrypted` with TPM2 to securely store and retrieve an unseal key. On service start, systemd automatically decrypts the credential using the VM's TPM, and the service unseals OpenBao.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- OpenBao must be initialized (`bao operator init` completed)
|
||||||
|
- You must have at least one unseal key from the initialization
|
||||||
|
- vault01 must have a TPM2 device (virtual TPM for Proxmox VMs)
|
||||||
|
|
||||||
|
## Initial Setup
|
||||||
|
|
||||||
|
Perform these steps on vault01 after deploying the service configuration:
|
||||||
|
|
||||||
|
### 1. Save Unseal Key
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create temporary file with one of your unseal keys
|
||||||
|
echo "paste-your-unseal-key-here" > /tmp/unseal-key.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Encrypt with TPM2
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Encrypt the key using TPM2 binding
|
||||||
|
systemd-creds encrypt \
|
||||||
|
--with-key=tpm2 \
|
||||||
|
--name=unseal-key \
|
||||||
|
/tmp/unseal-key.txt \
|
||||||
|
/var/lib/openbao/unseal-key.cred
|
||||||
|
|
||||||
|
# Set proper ownership and permissions
|
||||||
|
chown openbao:openbao /var/lib/openbao/unseal-key.cred
|
||||||
|
chmod 600 /var/lib/openbao/unseal-key.cred
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Cleanup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Securely delete the plaintext key
|
||||||
|
shred -u /tmp/unseal-key.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Test Auto-Unseal
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restart the service - it should auto-unseal
|
||||||
|
systemctl restart openbao
|
||||||
|
|
||||||
|
# Verify it's unsealed
|
||||||
|
bao status
|
||||||
|
# Should show: Sealed = false
|
||||||
|
```
|
||||||
|
|
||||||
|
## TPM PCR Binding
|
||||||
|
|
||||||
|
The default `--with-key=tpm2` binds the credential to PCR 7 (Secure Boot state). For stricter binding that includes firmware and boot state:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
systemd-creds encrypt \
|
||||||
|
--with-key=tpm2 \
|
||||||
|
--tpm2-pcrs=0+7+14 \
|
||||||
|
--name=unseal-key \
|
||||||
|
/tmp/unseal-key.txt \
|
||||||
|
/var/lib/openbao/unseal-key.cred
|
||||||
|
```
|
||||||
|
|
||||||
|
PCR meanings:
|
||||||
|
- **PCR 0**: BIOS/UEFI firmware measurements
|
||||||
|
- **PCR 7**: Secure Boot state (UEFI variables)
|
||||||
|
- **PCR 14**: MOK (Machine Owner Key) state
|
||||||
|
|
||||||
|
**Trade-off**: Stricter PCR binding improves security but may require re-encrypting the credential after firmware updates or kernel changes.
|
||||||
|
|
||||||
|
## Re-provisioning
|
||||||
|
|
||||||
|
If you need to reprovision vault01 from scratch:
|
||||||
|
|
||||||
|
1. **Before destroying**: Back up your root token and all unseal keys (stored securely offline)
|
||||||
|
2. **After recreating the VM**:
|
||||||
|
- Initialize OpenBao: `bao operator init`
|
||||||
|
- Follow the setup steps above to encrypt a new unseal key with TPM2
|
||||||
|
3. **Restore data** (if migrating): Copy `/var/lib/openbao` from backup
|
||||||
|
|
||||||
|
## Handling System Changes
|
||||||
|
|
||||||
|
**After firmware updates, kernel updates, or boot configuration changes**, PCR values may change, causing TPM decryption to fail.
|
||||||
|
|
||||||
|
### Symptoms
|
||||||
|
- Service fails to start
|
||||||
|
- Logs show: `Failed to decrypt credentials`
|
||||||
|
- OpenBao remains sealed after reboot
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
1. Unseal manually with one of your offline unseal keys:
|
||||||
|
```bash
|
||||||
|
bao operator unseal
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Re-encrypt the credential with updated PCR values:
|
||||||
|
```bash
|
||||||
|
echo "your-unseal-key" > /tmp/unseal-key.txt
|
||||||
|
systemd-creds encrypt \
|
||||||
|
--with-key=tpm2 \
|
||||||
|
--name=unseal-key \
|
||||||
|
/tmp/unseal-key.txt \
|
||||||
|
/var/lib/openbao/unseal-key.cred
|
||||||
|
chown openbao:openbao /var/lib/openbao/unseal-key.cred
|
||||||
|
chmod 600 /var/lib/openbao/unseal-key.cred
|
||||||
|
shred -u /tmp/unseal-key.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Restart the service:
|
||||||
|
```bash
|
||||||
|
systemctl restart openbao
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### What This Protects Against
|
||||||
|
- **Data at rest**: Vault data is encrypted and cannot be accessed without unsealing
|
||||||
|
- **VM snapshot theft**: An attacker with a VM snapshot cannot decrypt the unseal key without the TPM state
|
||||||
|
- **TPM binding**: The key can only be decrypted by the same VM with matching PCR values
|
||||||
|
|
||||||
|
### What This Does NOT Protect Against
|
||||||
|
- **Compromised host**: If an attacker gains root access to vault01 while running, they can access unsealed data
|
||||||
|
- **Boot-time attacks**: If an attacker can modify the boot process to match PCR values, they may retrieve the key
|
||||||
|
- **VM console access**: An attacker with VM console access during boot could potentially access the unsealed vault
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
- **Keep offline backups** of root token and all unseal keys in a secure location (password manager, encrypted USB, etc.)
|
||||||
|
- **Use Shamir secret sharing**: The default 5-key threshold means even if the TPM key is compromised, an attacker needs the other keys
|
||||||
|
- **Monitor access**: Use OpenBao's audit logging to detect unauthorized access
|
||||||
|
- **Consider stricter PCR binding** (PCR 0+7+14) for production, accepting the maintenance overhead
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Check if credential exists
|
||||||
|
```bash
|
||||||
|
ls -la /var/lib/openbao/unseal-key.cred
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test credential decryption manually
|
||||||
|
```bash
|
||||||
|
# Should output your unseal key if TPM decryption works
|
||||||
|
systemd-creds decrypt /var/lib/openbao/unseal-key.cred -
|
||||||
|
```
|
||||||
|
|
||||||
|
### View service logs
|
||||||
|
```bash
|
||||||
|
journalctl -u openbao -n 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### Manual unseal
|
||||||
|
```bash
|
||||||
|
bao operator unseal
|
||||||
|
# Enter one of your offline unseal keys when prompted
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check TPM status
|
||||||
|
```bash
|
||||||
|
# Check if TPM2 is available
|
||||||
|
ls /dev/tpm*
|
||||||
|
|
||||||
|
# View TPM PCR values
|
||||||
|
tpm2_pcrread
|
||||||
|
```
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [systemd.exec - Credentials](https://www.freedesktop.org/software/systemd/man/systemd.exec.html#Credentials)
|
||||||
|
- [systemd-creds man page](https://www.freedesktop.org/software/systemd/man/systemd-creds.html)
|
||||||
|
- [TPM2 PCR Documentation](https://uapi-group.org/specifications/specs/linux_tpm_pcr_registry/)
|
||||||
|
- [OpenBao Documentation](https://openbao.org/docs/)
|
||||||
70
flake.lock
generated
70
flake.lock
generated
@@ -21,55 +21,54 @@
|
|||||||
"url": "https://git.t-juice.club/torjus/alerttonotify"
|
"url": "https://git.t-juice.club/torjus/alerttonotify"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"backup-helper": {
|
"homelab-deploy": {
|
||||||
"inputs": {
|
"inputs": {
|
||||||
"nixpkgs": [
|
"nixpkgs": [
|
||||||
"nixpkgs-unstable"
|
"nixpkgs-unstable"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1738015166,
|
"lastModified": 1771488195,
|
||||||
"narHash": "sha256-573tR4aXNjILKvYnjZUM5DZZME2H6YTHJkUKs3ZehFU=",
|
"narHash": "sha256-2kMxqdDyPluRQRoES22Y0oSjp7pc5fj2nRterfmSIyc=",
|
||||||
"ref": "master",
|
"ref": "master",
|
||||||
"rev": "f9540cc065692c7ca80735e7b08399459e0ea6d6",
|
"rev": "2d26de50559d8acb82ea803764e138325d95572c",
|
||||||
"revCount": 35,
|
"revCount": 37,
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/backup-helper"
|
"url": "https://git.t-juice.club/torjus/homelab-deploy"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"ref": "master",
|
"ref": "master",
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/backup-helper"
|
"url": "https://git.t-juice.club/torjus/homelab-deploy"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"labmon": {
|
"nixos-exporter": {
|
||||||
"inputs": {
|
"inputs": {
|
||||||
"nixpkgs": [
|
"nixpkgs": [
|
||||||
"nixpkgs-unstable"
|
"nixpkgs-unstable"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1748983975,
|
"lastModified": 1770593543,
|
||||||
"narHash": "sha256-DA5mOqxwLMj/XLb4hvBU1WtE6cuVej7PjUr8N0EZsCE=",
|
"narHash": "sha256-hT8Rj6JAwGDFvcxWEcUzTCrWSiupCfBa57pBDnM2C5g=",
|
||||||
"ref": "master",
|
"ref": "refs/heads/master",
|
||||||
"rev": "040a73e891a70ff06ec7ab31d7167914129dbf7d",
|
"rev": "5aa5f7275b7a08015816171ba06d2cbdc2e02d3e",
|
||||||
"revCount": 17,
|
"revCount": 15,
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/labmon"
|
"url": "https://git.t-juice.club/torjus/nixos-exporter"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"ref": "master",
|
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://git.t-juice.club/torjus/labmon"
|
"url": "https://git.t-juice.club/torjus/nixos-exporter"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1769598131,
|
"lastModified": 1772822230,
|
||||||
"narHash": "sha256-e7VO/kGLgRMbWtpBqdWl0uFg8Y2XWFMdz0uUJvlML8o=",
|
"narHash": "sha256-yf3iYLGbGVlIthlQIk5/4/EQDZNNEmuqKZkQssMljuw=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "fa83fd837f3098e3e678e6cf017b2b36102c7211",
|
"rev": "71caefce12ba78d84fe618cf61644dce01cf3a96",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -81,11 +80,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs-unstable": {
|
"nixpkgs-unstable": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1769461804,
|
"lastModified": 1772773019,
|
||||||
"narHash": "sha256-msG8SU5WsBUfVVa/9RPLaymvi5bI8edTavbIq3vRlhI=",
|
"narHash": "sha256-E1bxHxNKfDoQUuvriG71+f+s/NT0qWkImXsYZNFFfCs=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "bfc1b8a4574108ceef22f02bafcf6611380c100d",
|
"rev": "aca4d95fce4914b3892661bcb80b8087293536c6",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@@ -98,31 +97,10 @@
|
|||||||
"root": {
|
"root": {
|
||||||
"inputs": {
|
"inputs": {
|
||||||
"alerttonotify": "alerttonotify",
|
"alerttonotify": "alerttonotify",
|
||||||
"backup-helper": "backup-helper",
|
"homelab-deploy": "homelab-deploy",
|
||||||
"labmon": "labmon",
|
"nixos-exporter": "nixos-exporter",
|
||||||
"nixpkgs": "nixpkgs",
|
"nixpkgs": "nixpkgs",
|
||||||
"nixpkgs-unstable": "nixpkgs-unstable",
|
"nixpkgs-unstable": "nixpkgs-unstable"
|
||||||
"sops-nix": "sops-nix"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"sops-nix": {
|
|
||||||
"inputs": {
|
|
||||||
"nixpkgs": [
|
|
||||||
"nixpkgs-unstable"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"locked": {
|
|
||||||
"lastModified": 1769469829,
|
|
||||||
"narHash": "sha256-wFcr32ZqspCxk4+FvIxIL0AZktRs6DuF8oOsLt59YBU=",
|
|
||||||
"owner": "Mic92",
|
|
||||||
"repo": "sops-nix",
|
|
||||||
"rev": "c5eebd4eb2e3372fe12a8d70a248a6ee9dd02eff",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
"original": {
|
|
||||||
"owner": "Mic92",
|
|
||||||
"repo": "sops-nix",
|
|
||||||
"type": "github"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
443
flake.nix
443
flake.nix
@@ -5,20 +5,16 @@
|
|||||||
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-25.11";
|
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-25.11";
|
||||||
nixpkgs-unstable.url = "github:nixos/nixpkgs?ref=nixos-unstable";
|
nixpkgs-unstable.url = "github:nixos/nixpkgs?ref=nixos-unstable";
|
||||||
|
|
||||||
sops-nix = {
|
|
||||||
url = "github:Mic92/sops-nix";
|
|
||||||
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
|
||||||
};
|
|
||||||
backup-helper = {
|
|
||||||
url = "git+https://git.t-juice.club/torjus/backup-helper?ref=master";
|
|
||||||
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
|
||||||
};
|
|
||||||
alerttonotify = {
|
alerttonotify = {
|
||||||
url = "git+https://git.t-juice.club/torjus/alerttonotify?ref=master";
|
url = "git+https://git.t-juice.club/torjus/alerttonotify?ref=master";
|
||||||
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
||||||
};
|
};
|
||||||
labmon = {
|
nixos-exporter = {
|
||||||
url = "git+https://git.t-juice.club/torjus/labmon?ref=master";
|
url = "git+https://git.t-juice.club/torjus/nixos-exporter";
|
||||||
|
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
||||||
|
};
|
||||||
|
homelab-deploy = {
|
||||||
|
url = "git+https://git.t-juice.club/torjus/homelab-deploy?ref=master";
|
||||||
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
inputs.nixpkgs.follows = "nixpkgs-unstable";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@@ -28,10 +24,9 @@
|
|||||||
self,
|
self,
|
||||||
nixpkgs,
|
nixpkgs,
|
||||||
nixpkgs-unstable,
|
nixpkgs-unstable,
|
||||||
sops-nix,
|
|
||||||
backup-helper,
|
|
||||||
alerttonotify,
|
alerttonotify,
|
||||||
labmon,
|
nixos-exporter,
|
||||||
|
homelab-deploy,
|
||||||
...
|
...
|
||||||
}@inputs:
|
}@inputs:
|
||||||
let
|
let
|
||||||
@@ -45,7 +40,19 @@
|
|||||||
commonOverlays = [
|
commonOverlays = [
|
||||||
overlay-unstable
|
overlay-unstable
|
||||||
alerttonotify.overlays.default
|
alerttonotify.overlays.default
|
||||||
labmon.overlays.default
|
];
|
||||||
|
# Common modules applied to all hosts
|
||||||
|
commonModules = [
|
||||||
|
(
|
||||||
|
{ config, pkgs, ... }:
|
||||||
|
{
|
||||||
|
nixpkgs.overlays = commonOverlays;
|
||||||
|
system.configurationRevision = self.rev or self.dirtyRev or "dirty";
|
||||||
|
}
|
||||||
|
)
|
||||||
|
nixos-exporter.nixosModules.default
|
||||||
|
homelab-deploy.nixosModules.default
|
||||||
|
./modules/homelab
|
||||||
];
|
];
|
||||||
allSystems = [
|
allSystems = [
|
||||||
"x86_64-linux"
|
"x86_64-linux"
|
||||||
@@ -58,312 +65,175 @@
|
|||||||
in
|
in
|
||||||
{
|
{
|
||||||
nixosConfigurations = {
|
nixosConfigurations = {
|
||||||
ns1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ns2 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns2
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ns3 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns3
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ns4 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ns4
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
nixos-test1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/nixos-test1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
backup-helper.nixosModules.backup-helper
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ha1 = nixpkgs.lib.nixosSystem {
|
ha1 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ha1
|
./hosts/ha1
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
backup-helper.nixosModules.backup-helper
|
|
||||||
];
|
|
||||||
};
|
|
||||||
template1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/template
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
template2 = nixpkgs.lib.nixosSystem {
|
template2 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/template2
|
./hosts/template2
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
http-proxy = nixpkgs.lib.nixosSystem {
|
http-proxy = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/http-proxy
|
./hosts/http-proxy
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
ca = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/ca
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
monitoring01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/monitoring01
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
backup-helper.nixosModules.backup-helper
|
|
||||||
labmon.nixosModules.labmon
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
jelly01 = nixpkgs.lib.nixosSystem {
|
jelly01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/jelly01
|
./hosts/jelly01
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
nix-cache01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/nix-cache01
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
media1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/media1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
pgdb1 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/pgdb1
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
nats1 = nixpkgs.lib.nixosSystem {
|
nats1 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/nats1
|
./hosts/nats1
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
auth01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/auth01
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
|
||||||
};
|
|
||||||
testvm01 = nixpkgs.lib.nixosSystem {
|
|
||||||
inherit system;
|
|
||||||
specialArgs = {
|
|
||||||
inherit inputs self sops-nix;
|
|
||||||
};
|
|
||||||
modules = [
|
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/testvm01
|
|
||||||
sops-nix.nixosModules.sops
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
vault01 = nixpkgs.lib.nixosSystem {
|
vault01 = nixpkgs.lib.nixosSystem {
|
||||||
inherit system;
|
inherit system;
|
||||||
specialArgs = {
|
specialArgs = {
|
||||||
inherit inputs self sops-nix;
|
inherit inputs self;
|
||||||
};
|
};
|
||||||
modules = [
|
modules = commonModules ++ [
|
||||||
(
|
|
||||||
{ config, pkgs, ... }:
|
|
||||||
{
|
|
||||||
nixpkgs.overlays = commonOverlays;
|
|
||||||
}
|
|
||||||
)
|
|
||||||
./hosts/vault01
|
./hosts/vault01
|
||||||
sops-nix.nixosModules.sops
|
];
|
||||||
|
};
|
||||||
|
testvm01 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/testvm01
|
||||||
|
];
|
||||||
|
};
|
||||||
|
testvm02 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/testvm02
|
||||||
|
];
|
||||||
|
};
|
||||||
|
testvm03 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/testvm03
|
||||||
|
];
|
||||||
|
};
|
||||||
|
ns2 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/ns2
|
||||||
|
];
|
||||||
|
};
|
||||||
|
ns1 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/ns1
|
||||||
|
];
|
||||||
|
};
|
||||||
|
kanidm01 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/kanidm01
|
||||||
|
];
|
||||||
|
};
|
||||||
|
monitoring02 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/monitoring02
|
||||||
|
];
|
||||||
|
};
|
||||||
|
nix-cache02 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/nix-cache02
|
||||||
|
];
|
||||||
|
};
|
||||||
|
garage01 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/garage01
|
||||||
|
];
|
||||||
|
};
|
||||||
|
pn01 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/pn01
|
||||||
|
];
|
||||||
|
};
|
||||||
|
pn02 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/pn02
|
||||||
|
];
|
||||||
|
};
|
||||||
|
nrec-nixos01 = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/nrec-nixos01
|
||||||
|
];
|
||||||
|
};
|
||||||
|
openstack-template = nixpkgs.lib.nixosSystem {
|
||||||
|
inherit system;
|
||||||
|
specialArgs = {
|
||||||
|
inherit inputs self;
|
||||||
|
};
|
||||||
|
modules = commonModules ++ [
|
||||||
|
./hosts/openstack-template
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@@ -371,17 +241,24 @@
|
|||||||
{ pkgs }:
|
{ pkgs }:
|
||||||
{
|
{
|
||||||
create-host = pkgs.callPackage ./scripts/create-host { };
|
create-host = pkgs.callPackage ./scripts/create-host { };
|
||||||
|
vault-fetch = pkgs.callPackage ./scripts/vault-fetch { };
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
devShells = forAllSystems (
|
devShells = forAllSystems (
|
||||||
{ pkgs }:
|
{ pkgs }:
|
||||||
{
|
{
|
||||||
default = pkgs.mkShell {
|
default = pkgs.mkShell {
|
||||||
packages = with pkgs; [
|
packages = [
|
||||||
ansible
|
pkgs.ansible
|
||||||
opentofu
|
pkgs.opentofu
|
||||||
|
pkgs.openbao
|
||||||
|
pkgs.kanidm_1_8
|
||||||
|
pkgs.nkeys
|
||||||
|
pkgs.openstackclient
|
||||||
(pkgs.callPackage ./scripts/create-host { })
|
(pkgs.callPackage ./scripts/create-host { })
|
||||||
|
homelab-deploy.packages.${pkgs.system}.default
|
||||||
];
|
];
|
||||||
|
ANSIBLE_CONFIG = "./ansible/ansible.cfg";
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,8 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
../../services/lldap
|
|
||||||
../../services/authelia
|
|
||||||
];
|
|
||||||
}
|
|
||||||
72
hosts/garage01/configuration.nix
Normal file
72
hosts/garage01/configuration.nix
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
];
|
||||||
|
|
||||||
|
# Host metadata (adjust as needed)
|
||||||
|
homelab.host = {
|
||||||
|
tier = "test"; # Start in test tier, move to prod after validation
|
||||||
|
role = "storage";
|
||||||
|
};
|
||||||
|
|
||||||
|
homelab.dns.cnames = [ "s3" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "garage01";
|
||||||
|
networking.domain = "home.2rjus.net";
|
||||||
|
networking.useNetworkd = true;
|
||||||
|
networking.useDHCP = false;
|
||||||
|
services.resolved.enable = true;
|
||||||
|
networking.nameservers = [
|
||||||
|
"10.69.13.5"
|
||||||
|
"10.69.13.6"
|
||||||
|
];
|
||||||
|
|
||||||
|
systemd.network.enable = true;
|
||||||
|
systemd.network.networks."ens18" = {
|
||||||
|
matchConfig.Name = "ens18";
|
||||||
|
address = [
|
||||||
|
"10.69.13.26/24"
|
||||||
|
];
|
||||||
|
routes = [
|
||||||
|
{ Gateway = "10.69.13.1"; }
|
||||||
|
];
|
||||||
|
linkConfig.RequiredForOnline = "routable";
|
||||||
|
};
|
||||||
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
|
|
||||||
|
nix.settings.tarball-ttl = 0;
|
||||||
|
environment.systemPackages = with pkgs; [
|
||||||
|
vim
|
||||||
|
wget
|
||||||
|
git
|
||||||
|
];
|
||||||
|
|
||||||
|
# Open ports in the firewall.
|
||||||
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
|
# Or disable the firewall altogether.
|
||||||
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
|
}
|
||||||
6
hosts/garage01/default.nix
Normal file
6
hosts/garage01/default.nix
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{ ... }: {
|
||||||
|
imports = [
|
||||||
|
./configuration.nix
|
||||||
|
../../services/garage
|
||||||
|
];
|
||||||
|
}
|
||||||
@@ -7,12 +7,14 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "home-automation";
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
# Use the systemd-boot EFI boot loader.
|
||||||
boot.loader.grub = {
|
boot.loader.grub = {
|
||||||
@@ -44,10 +46,7 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
@@ -55,16 +54,37 @@
|
|||||||
git
|
git
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Vault secrets management
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
vault.secrets.backup-helper = {
|
||||||
|
secretPath = "shared/backup/password";
|
||||||
|
extractKey = "password";
|
||||||
|
outputDir = "/run/secrets/backup_helper_secret";
|
||||||
|
services = [ "restic-backups-ha1" ];
|
||||||
|
};
|
||||||
|
|
||||||
# Backup service dirs
|
# Backup service dirs
|
||||||
sops.secrets."backup_helper_secret" = { };
|
services.restic.backups.ha1 = {
|
||||||
backup-helper = {
|
repository = "rest:http://10.69.12.52:8000/backup-nix";
|
||||||
enable = true;
|
passwordFile = "/run/secrets/backup_helper_secret";
|
||||||
password-file = "/run/secrets/backup_helper_secret";
|
paths = [
|
||||||
backup-dirs = [
|
|
||||||
"/var/lib/hass"
|
"/var/lib/hass"
|
||||||
"/var/lib/zigbee2mqtt"
|
"/var/lib/zigbee2mqtt"
|
||||||
"/var/lib/mosquitto"
|
"/var/lib/mosquitto"
|
||||||
];
|
];
|
||||||
|
timerConfig = {
|
||||||
|
OnCalendar = "daily";
|
||||||
|
Persistent = true;
|
||||||
|
RandomizedDelaySec = "2h";
|
||||||
|
};
|
||||||
|
pruneOpts = [
|
||||||
|
"--keep-daily 7"
|
||||||
|
"--keep-weekly 4"
|
||||||
|
"--keep-monthly 6"
|
||||||
|
"--keep-within 1d"
|
||||||
|
];
|
||||||
|
extraOptions = [ "--retry-lock=5m" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
|
|||||||
@@ -5,12 +5,22 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "proxy";
|
||||||
|
homelab.dns.cnames = [
|
||||||
|
"nzbget"
|
||||||
|
"radarr"
|
||||||
|
"sonarr"
|
||||||
|
"ha"
|
||||||
|
"z2m"
|
||||||
|
"jelly"
|
||||||
|
];
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
# Use the systemd-boot EFI boot loader.
|
||||||
boot.loader.grub = {
|
boot.loader.grub = {
|
||||||
@@ -42,10 +52,10 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
vault.enable = true;
|
||||||
"flakes"
|
homelab.deploy.enable = true;
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
|
|||||||
42
hosts/http-proxy/hardware-configuration.nix
Normal file
42
hosts/http-proxy/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -1,9 +1,12 @@
|
|||||||
{ config, ... }:
|
{ config, ... }:
|
||||||
{
|
{
|
||||||
sops.secrets.wireguard_private_key = {
|
vault.secrets.wireguard = {
|
||||||
sopsFile = ../../secrets/http-proxy/wireguard.yaml;
|
secretPath = "hosts/http-proxy/wireguard";
|
||||||
key = "wg_private_key";
|
extractKey = "private_key";
|
||||||
|
outputDir = "/run/secrets/wireguard_private_key";
|
||||||
|
services = [ "wireguard-wg0" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.wireguard = {
|
networking.wireguard = {
|
||||||
enable = true;
|
enable = true;
|
||||||
useNetworkd = true;
|
useNetworkd = true;
|
||||||
@@ -13,7 +16,7 @@
|
|||||||
ips = [ "10.69.222.3/24" ];
|
ips = [ "10.69.222.3/24" ];
|
||||||
mtu = 1384;
|
mtu = 1384;
|
||||||
listenPort = 51820;
|
listenPort = 51820;
|
||||||
privateKeyFile = config.sops.secrets.wireguard_private_key.path;
|
privateKeyFile = "/run/secrets/wireguard_private_key";
|
||||||
peers = [
|
peers = [
|
||||||
{
|
{
|
||||||
name = "docker2.t-juice.club";
|
name = "docker2.t-juice.club";
|
||||||
@@ -26,7 +29,11 @@
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
# monitoring
|
homelab.monitoring.scrapeTargets = [{
|
||||||
|
job_name = "wireguard";
|
||||||
|
port = 9586;
|
||||||
|
}];
|
||||||
|
|
||||||
services.prometheus.exporters.wireguard = {
|
services.prometheus.exporters.wireguard = {
|
||||||
enable = true;
|
enable = true;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -5,12 +5,14 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "media";
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
# Use the systemd-boot EFI boot loader.
|
||||||
boot.loader.grub = {
|
boot.loader.grub = {
|
||||||
@@ -42,10 +44,7 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
@@ -61,9 +60,8 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
zramSwap = {
|
vault.enable = true;
|
||||||
enable = true;
|
homelab.deploy.enable = true;
|
||||||
};
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "23.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
|
|||||||
42
hosts/jelly01/hardware-configuration.nix
Normal file
42
hosts/jelly01/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
../../system
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
|
|
||||||
networking.hostName = "jump";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = false;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.10/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -1,29 +1,38 @@
|
|||||||
{
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
pkgs,
|
pkgs,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
|
../../services/kanidm
|
||||||
];
|
];
|
||||||
|
|
||||||
fileSystems."/nix" = {
|
homelab.host = {
|
||||||
device = "/dev/disk/by-label/nixcache";
|
tier = "prod";
|
||||||
fsType = "xfs";
|
role = "auth";
|
||||||
};
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub = {
|
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.hostName = "nix-cache01";
|
# DNS CNAME for auth.home.2rjus.net
|
||||||
|
homelab.dns.cnames = [ "auth" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "kanidm01";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
@@ -37,7 +46,7 @@
|
|||||||
systemd.network.networks."ens18" = {
|
systemd.network.networks."ens18" = {
|
||||||
matchConfig.Name = "ens18";
|
matchConfig.Name = "ens18";
|
||||||
address = [
|
address = [
|
||||||
"10.69.13.15/24"
|
"10.69.13.23/24"
|
||||||
];
|
];
|
||||||
routes = [
|
routes = [
|
||||||
{ Gateway = "10.69.13.1"; }
|
{ Gateway = "10.69.13.1"; }
|
||||||
@@ -46,10 +55,7 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
@@ -57,13 +63,11 @@
|
|||||||
git
|
git
|
||||||
];
|
];
|
||||||
|
|
||||||
services.qemuGuest.enable = true;
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "24.05"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,76 +0,0 @@
|
|||||||
{
|
|
||||||
pkgs,
|
|
||||||
...
|
|
||||||
}:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot = {
|
|
||||||
loader.systemd-boot = {
|
|
||||||
enable = true;
|
|
||||||
configurationLimit = 5;
|
|
||||||
memtest86.enable = true;
|
|
||||||
};
|
|
||||||
loader.efi.canTouchEfiVariables = true;
|
|
||||||
supportedFilesystems = [ "nfs" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hostName = "media1";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."enp2s0" = {
|
|
||||||
matchConfig.Name = "enp2s0";
|
|
||||||
address = [
|
|
||||||
"10.69.12.82/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.12.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
# Graphics
|
|
||||||
hardware.graphics = {
|
|
||||||
enable = true;
|
|
||||||
extraPackages = with pkgs; [
|
|
||||||
libvdpau-va-gl
|
|
||||||
libva-vdpau-driver
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
{ pkgs, ... }:
|
|
||||||
let
|
|
||||||
kodipkg = pkgs.kodi-wayland.withPackages (
|
|
||||||
p: with p; [
|
|
||||||
jellyfin
|
|
||||||
]
|
|
||||||
);
|
|
||||||
in
|
|
||||||
{
|
|
||||||
users.users.kodi = {
|
|
||||||
isNormalUser = true;
|
|
||||||
description = "Kodi Media Center user";
|
|
||||||
};
|
|
||||||
#services.xserver = {
|
|
||||||
# enable = true;
|
|
||||||
#};
|
|
||||||
services.cage = {
|
|
||||||
enable = true;
|
|
||||||
user = "kodi";
|
|
||||||
environment = {
|
|
||||||
XKB_DEFAULT_LAYOUT = "no";
|
|
||||||
};
|
|
||||||
program = "${kodipkg}/bin/kodi";
|
|
||||||
};
|
|
||||||
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
firefox
|
|
||||||
];
|
|
||||||
}
|
|
||||||
@@ -1,134 +0,0 @@
|
|||||||
{
|
|
||||||
pkgs,
|
|
||||||
...
|
|
||||||
}:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
../../common/vm
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub = {
|
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
|
||||||
|
|
||||||
networking.hostName = "monitoring01";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.13/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
sqlite
|
|
||||||
];
|
|
||||||
|
|
||||||
services.qemuGuest.enable = true;
|
|
||||||
|
|
||||||
sops.secrets."backup_helper_secret" = { };
|
|
||||||
backup-helper = {
|
|
||||||
enable = true;
|
|
||||||
password-file = "/run/secrets/backup_helper_secret";
|
|
||||||
backup-dirs = [
|
|
||||||
"/var/lib/grafana/plugins"
|
|
||||||
];
|
|
||||||
backup-commands = [
|
|
||||||
# "grafana.db:${pkgs.sqlite}/bin/sqlite /var/lib/grafana/data/grafana.db .dump"
|
|
||||||
"grafana.db:${pkgs.sqlite}/bin/sqlite3 /var/lib/grafana/data/grafana.db .dump"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
labmon = {
|
|
||||||
enable = true;
|
|
||||||
|
|
||||||
settings = {
|
|
||||||
ListenAddr = ":9969";
|
|
||||||
Profiling = true;
|
|
||||||
StepMonitors = [
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
BaseURL = "https://ca.home.2rjus.net";
|
|
||||||
RootID = "3381bda8015a86b9a3cd1851439d1091890a79005e0f1f7c4301fe4bccc29d80";
|
|
||||||
}
|
|
||||||
];
|
|
||||||
|
|
||||||
TLSConnectionMonitors = [
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "ca.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "jelly.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "grafana.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "prometheus.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "alertmanager.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
{
|
|
||||||
Enabled = true;
|
|
||||||
Address = "pyroscope.home.2rjus.net:443";
|
|
||||||
Verify = true;
|
|
||||||
Duration = "12h";
|
|
||||||
}
|
|
||||||
];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
imports = [
|
|
||||||
./configuration.nix
|
|
||||||
../../services/monitoring
|
|
||||||
];
|
|
||||||
}
|
|
||||||
71
hosts/monitoring02/configuration.nix
Normal file
71
hosts/monitoring02/configuration.nix
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
];
|
||||||
|
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "monitoring";
|
||||||
|
};
|
||||||
|
|
||||||
|
homelab.dns.cnames = [ "monitoring" "alertmanager" "grafana" "grafana-test" "metrics" "vmalert" "loki" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "monitoring02";
|
||||||
|
networking.domain = "home.2rjus.net";
|
||||||
|
networking.useNetworkd = true;
|
||||||
|
networking.useDHCP = false;
|
||||||
|
services.resolved.enable = true;
|
||||||
|
networking.nameservers = [
|
||||||
|
"10.69.13.5"
|
||||||
|
"10.69.13.6"
|
||||||
|
];
|
||||||
|
|
||||||
|
systemd.network.enable = true;
|
||||||
|
systemd.network.networks."ens18" = {
|
||||||
|
matchConfig.Name = "ens18";
|
||||||
|
address = [
|
||||||
|
"10.69.13.24/24"
|
||||||
|
];
|
||||||
|
routes = [
|
||||||
|
{ Gateway = "10.69.13.1"; }
|
||||||
|
];
|
||||||
|
linkConfig.RequiredForOnline = "routable";
|
||||||
|
};
|
||||||
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
|
|
||||||
|
nix.settings.tarball-ttl = 0;
|
||||||
|
environment.systemPackages = with pkgs; [
|
||||||
|
vim
|
||||||
|
wget
|
||||||
|
git
|
||||||
|
];
|
||||||
|
|
||||||
|
# Open ports in the firewall.
|
||||||
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
|
# Or disable the firewall altogether.
|
||||||
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
|
}
|
||||||
12
hosts/monitoring02/default.nix
Normal file
12
hosts/monitoring02/default.nix
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{ ... }: {
|
||||||
|
imports = [
|
||||||
|
./configuration.nix
|
||||||
|
../../services/grafana
|
||||||
|
../../services/victoriametrics
|
||||||
|
../../services/loki
|
||||||
|
../../services/monitoring/alerttonotify.nix
|
||||||
|
../../services/monitoring/blackbox.nix
|
||||||
|
../../services/monitoring/exportarr.nix
|
||||||
|
../../services/monitoring/pve.nix
|
||||||
|
];
|
||||||
|
}
|
||||||
@@ -5,12 +5,14 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
./hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
|
homelab.host.role = "messaging";
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
# Use the systemd-boot EFI boot loader.
|
||||||
boot.loader.grub = {
|
boot.loader.grub = {
|
||||||
@@ -42,10 +44,7 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
@@ -59,5 +58,8 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
|
vault.enable = true;
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "23.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
|
|||||||
42
hosts/nats1/hardware-configuration.nix
Normal file
42
hosts/nats1/hardware-configuration.nix
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
modulesPath,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
boot.initrd.availableKernelModules = [
|
||||||
|
"ata_piix"
|
||||||
|
"uhci_hcd"
|
||||||
|
"virtio_pci"
|
||||||
|
"virtio_scsi"
|
||||||
|
"sd_mod"
|
||||||
|
"sr_mod"
|
||||||
|
];
|
||||||
|
boot.initrd.kernelModules = [ "dm-snapshot" ];
|
||||||
|
boot.kernelModules = [
|
||||||
|
"ptp_kvm"
|
||||||
|
];
|
||||||
|
boot.extraModulePackages = [ ];
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/root";
|
||||||
|
fsType = "xfs";
|
||||||
|
};
|
||||||
|
|
||||||
|
swapDevices = [ { device = "/dev/disk/by-label/swap"; } ];
|
||||||
|
|
||||||
|
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||||
|
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||||
|
# still possible to use this option, but it's recommended to use it in conjunction
|
||||||
|
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||||
|
networking.useDHCP = lib.mkDefault true;
|
||||||
|
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
||||||
|
|
||||||
|
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||||
|
}
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
{ ... }:
|
|
||||||
{
|
|
||||||
zramSwap = {
|
|
||||||
enable = true;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
45
hosts/nix-cache02/builder.nix
Normal file
45
hosts/nix-cache02/builder.nix
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
{ config, ... }:
|
||||||
|
{
|
||||||
|
# Fetch builder NKey from Vault
|
||||||
|
vault.secrets.builder-nkey = {
|
||||||
|
secretPath = "shared/homelab-deploy/builder-nkey";
|
||||||
|
extractKey = "nkey";
|
||||||
|
outputDir = "/run/secrets/builder-nkey";
|
||||||
|
services = [ "homelab-deploy-builder" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Configure the builder service
|
||||||
|
services.homelab-deploy.builder = {
|
||||||
|
enable = true;
|
||||||
|
natsUrl = "nats://nats1.home.2rjus.net:4222";
|
||||||
|
nkeyFile = "/run/secrets/builder-nkey";
|
||||||
|
|
||||||
|
settings.repos = {
|
||||||
|
nixos-servers = {
|
||||||
|
url = "git+https://git.t-juice.club/torjus/nixos-servers.git";
|
||||||
|
defaultBranch = "master";
|
||||||
|
};
|
||||||
|
nixos = {
|
||||||
|
url = "git+https://git.t-juice.club/torjus/nixos.git";
|
||||||
|
defaultBranch = "master";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
timeout = 14400;
|
||||||
|
metrics.enable = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
# Expose builder metrics for Prometheus scraping
|
||||||
|
homelab.monitoring.scrapeTargets = [
|
||||||
|
{
|
||||||
|
job_name = "homelab-deploy-builder";
|
||||||
|
port = 9973;
|
||||||
|
}
|
||||||
|
];
|
||||||
|
|
||||||
|
# Ensure builder starts after vault secret is available
|
||||||
|
systemd.services.homelab-deploy-builder = {
|
||||||
|
after = [ "vault-secret-builder-nkey.service" ];
|
||||||
|
requires = [ "vault-secret-builder-nkey.service" ];
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -1,25 +1,36 @@
|
|||||||
{
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
pkgs,
|
pkgs,
|
||||||
...
|
...
|
||||||
}:
|
}:
|
||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
../../common/vm
|
../../common/vm
|
||||||
];
|
];
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
homelab.host = {
|
||||||
# Use the systemd-boot EFI boot loader.
|
tier = "prod";
|
||||||
boot.loader.grub = {
|
role = "build-host";
|
||||||
enable = true;
|
|
||||||
device = "/dev/sda";
|
|
||||||
configurationLimit = 3;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
networking.hostName = "auth01";
|
homelab.dns.cnames = [ "nix-cache" ];
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
|
nixpkgs.config.allowUnfree = true;
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
|
networking.hostName = "nix-cache02";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
@@ -33,7 +44,7 @@
|
|||||||
systemd.network.networks."ens18" = {
|
systemd.network.networks."ens18" = {
|
||||||
matchConfig.Name = "ens18";
|
matchConfig.Name = "ens18";
|
||||||
address = [
|
address = [
|
||||||
"10.69.13.18/24"
|
"10.69.13.25/24"
|
||||||
];
|
];
|
||||||
routes = [
|
routes = [
|
||||||
{ Gateway = "10.69.13.1"; }
|
{ Gateway = "10.69.13.1"; }
|
||||||
@@ -42,10 +53,7 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
@@ -53,13 +61,11 @@
|
|||||||
git
|
git
|
||||||
];
|
];
|
||||||
|
|
||||||
services.qemuGuest.enable = true;
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
# Open ports in the firewall.
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,9 +1,8 @@
|
|||||||
{ ... }:
|
{ ... }: {
|
||||||
{
|
|
||||||
imports = [
|
imports = [
|
||||||
./configuration.nix
|
./configuration.nix
|
||||||
|
./builder.nix
|
||||||
|
./scheduler.nix
|
||||||
../../services/nix-cache
|
../../services/nix-cache
|
||||||
../../services/actions-runner
|
|
||||||
./zram.nix
|
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
61
hosts/nix-cache02/scheduler.nix
Normal file
61
hosts/nix-cache02/scheduler.nix
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
{ config, pkgs, lib, inputs, ... }:
|
||||||
|
let
|
||||||
|
homelab-deploy = inputs.homelab-deploy.packages.${pkgs.system}.default;
|
||||||
|
|
||||||
|
scheduledBuildScript = pkgs.writeShellApplication {
|
||||||
|
name = "scheduled-build";
|
||||||
|
runtimeInputs = [ homelab-deploy ];
|
||||||
|
text = ''
|
||||||
|
NATS_URL="nats://nats1.home.2rjus.net:4222"
|
||||||
|
NKEY_FILE="/run/secrets/scheduler-nkey"
|
||||||
|
|
||||||
|
echo "Starting scheduled builds at $(date)"
|
||||||
|
|
||||||
|
# Build all nixos-servers hosts
|
||||||
|
homelab-deploy build \
|
||||||
|
--nats-url "$NATS_URL" \
|
||||||
|
--nkey-file "$NKEY_FILE" \
|
||||||
|
nixos-servers --all
|
||||||
|
|
||||||
|
# Build all nixos (gunter) hosts
|
||||||
|
homelab-deploy build \
|
||||||
|
--nats-url "$NATS_URL" \
|
||||||
|
--nkey-file "$NKEY_FILE" \
|
||||||
|
nixos --all
|
||||||
|
|
||||||
|
echo "Scheduled builds completed at $(date)"
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
in
|
||||||
|
{
|
||||||
|
# Fetch scheduler NKey from Vault
|
||||||
|
vault.secrets.scheduler-nkey = {
|
||||||
|
secretPath = "shared/homelab-deploy/scheduler-nkey";
|
||||||
|
extractKey = "nkey";
|
||||||
|
outputDir = "/run/secrets/scheduler-nkey";
|
||||||
|
services = [ "scheduled-build" ];
|
||||||
|
};
|
||||||
|
|
||||||
|
# Timer: every 2 hours
|
||||||
|
systemd.timers.scheduled-build = {
|
||||||
|
description = "Trigger scheduled Nix builds";
|
||||||
|
wantedBy = [ "timers.target" ];
|
||||||
|
timerConfig = {
|
||||||
|
OnCalendar = "*-*-* 00/2:00:00"; # Every 2 hours at :00
|
||||||
|
Persistent = true; # Run missed builds on boot
|
||||||
|
RandomizedDelaySec = "5m"; # Slight jitter
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
# Service: oneshot that triggers builds
|
||||||
|
systemd.services.scheduled-build = {
|
||||||
|
description = "Trigger builds for all hosts via NATS";
|
||||||
|
after = [ "network-online.target" "vault-secret-scheduler-nkey.service" ];
|
||||||
|
requires = [ "vault-secret-scheduler-nkey.service" ];
|
||||||
|
wants = [ "network-online.target" ];
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
ExecStart = lib.getExe scheduledBuildScript;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
|
|
||||||
networking.hostName = "nixos-test1";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = true;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.10/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
# Secrets
|
|
||||||
# Backup helper
|
|
||||||
sops.secrets."backup_helper_secret" = { };
|
|
||||||
backup-helper = {
|
|
||||||
enable = true;
|
|
||||||
password-file = "/run/secrets/backup_helper_secret";
|
|
||||||
backup-dirs = [
|
|
||||||
"/etc/machine-id"
|
|
||||||
"/etc/os-release"
|
|
||||||
];
|
|
||||||
};
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
78
hosts/nrec-nixos01/configuration.nix
Normal file
78
hosts/nrec-nixos01/configuration.nix
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
{
|
||||||
|
lib,
|
||||||
|
pkgs,
|
||||||
|
...
|
||||||
|
}:
|
||||||
|
|
||||||
|
{
|
||||||
|
services.openssh = {
|
||||||
|
enable = true;
|
||||||
|
settings = {
|
||||||
|
PermitRootLogin = lib.mkForce "no";
|
||||||
|
PasswordAuthentication = false;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
users.users.nixos = {
|
||||||
|
isNormalUser = true;
|
||||||
|
extraGroups = [ "wheel" ];
|
||||||
|
shell = pkgs.zsh;
|
||||||
|
openssh.authorizedKeys.keys = [
|
||||||
|
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAwfb2jpKrBnCw28aevnH8HbE5YbcMXpdaVv2KmueDu6 torjus@gunter"
|
||||||
|
];
|
||||||
|
};
|
||||||
|
security.sudo.wheelNeedsPassword = false;
|
||||||
|
programs.zsh.enable = true;
|
||||||
|
|
||||||
|
homelab.dns.enable = false;
|
||||||
|
homelab.monitoring.enable = false;
|
||||||
|
homelab.host.labels.ansible = "false";
|
||||||
|
|
||||||
|
fileSystems."/" = {
|
||||||
|
device = "/dev/disk/by-label/nixos";
|
||||||
|
fsType = "ext4";
|
||||||
|
autoResize = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
boot.loader.grub.enable = true;
|
||||||
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
networking.hostName = "nrec-nixos01";
|
||||||
|
networking.useNetworkd = true;
|
||||||
|
networking.useDHCP = false;
|
||||||
|
services.resolved.enable = true;
|
||||||
|
|
||||||
|
systemd.network.enable = true;
|
||||||
|
systemd.network.networks."ens3" = {
|
||||||
|
matchConfig.Name = "ens3";
|
||||||
|
networkConfig.DHCP = "ipv4";
|
||||||
|
linkConfig.RequiredForOnline = "routable";
|
||||||
|
};
|
||||||
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
|
networking.firewall.enable = true;
|
||||||
|
networking.firewall.allowedTCPPorts = [
|
||||||
|
22
|
||||||
|
80
|
||||||
|
443
|
||||||
|
];
|
||||||
|
|
||||||
|
nix.settings.substituters = [
|
||||||
|
"https://cache.nixos.org"
|
||||||
|
];
|
||||||
|
nix.settings.trusted-public-keys = [
|
||||||
|
"cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="
|
||||||
|
];
|
||||||
|
|
||||||
|
services.caddy = {
|
||||||
|
enable = true;
|
||||||
|
virtualHosts."nrec-nixos01.t-juice.club" = {
|
||||||
|
extraConfig = ''
|
||||||
|
reverse_proxy 127.0.0.1:3000
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
zramSwap.enable = true;
|
||||||
|
|
||||||
|
system.stateVersion = "25.11";
|
||||||
|
}
|
||||||
9
hosts/nrec-nixos01/default.nix
Normal file
9
hosts/nrec-nixos01/default.nix
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{ modulesPath, ... }:
|
||||||
|
{
|
||||||
|
imports = [
|
||||||
|
./configuration.nix
|
||||||
|
../../system/packages.nix
|
||||||
|
../../services/forgejo
|
||||||
|
(modulesPath + "/profiles/qemu-guest.nix")
|
||||||
|
];
|
||||||
|
}
|
||||||
@@ -7,23 +7,38 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
|
||||||
|
# DNS services
|
||||||
../../services/ns/master-authorative.nix
|
../../services/ns/master-authorative.nix
|
||||||
../../services/ns/resolver.nix
|
../../services/ns/resolver.nix
|
||||||
../../common/vm
|
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Host metadata
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "primary";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/sda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
networking.hostName = "ns1";
|
networking.hostName = "ns1";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
|
# Disable resolved - conflicts with Unbound resolver
|
||||||
services.resolved.enable = false;
|
services.resolved.enable = false;
|
||||||
networking.nameservers = [
|
networking.nameservers = [
|
||||||
"10.69.13.5"
|
"10.69.13.5"
|
||||||
@@ -43,10 +58,7 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
nix.settings.tarball-ttl = 0;
|
nix.settings.tarball-ttl = 0;
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
@@ -60,5 +72,5 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -7,23 +7,38 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
../template/hardware-configuration.nix
|
../template2/hardware-configuration.nix
|
||||||
|
|
||||||
../../system
|
../../system
|
||||||
|
../../common/vm
|
||||||
|
|
||||||
|
# DNS services
|
||||||
../../services/ns/secondary-authorative.nix
|
../../services/ns/secondary-authorative.nix
|
||||||
../../services/ns/resolver.nix
|
../../services/ns/resolver.nix
|
||||||
../../common/vm
|
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Host metadata
|
||||||
|
homelab.host = {
|
||||||
|
tier = "prod";
|
||||||
|
role = "dns";
|
||||||
|
labels.dns_role = "secondary";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Enable Vault integration
|
||||||
|
vault.enable = true;
|
||||||
|
|
||||||
|
# Enable remote deployment via NATS
|
||||||
|
homelab.deploy.enable = true;
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
nixpkgs.config.allowUnfree = true;
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
boot.loader.grub.enable = true;
|
||||||
boot.loader.grub.device = "/dev/sda";
|
boot.loader.grub.device = "/dev/vda";
|
||||||
|
|
||||||
networking.hostName = "ns2";
|
networking.hostName = "ns2";
|
||||||
networking.domain = "home.2rjus.net";
|
networking.domain = "home.2rjus.net";
|
||||||
networking.useNetworkd = true;
|
networking.useNetworkd = true;
|
||||||
networking.useDHCP = false;
|
networking.useDHCP = false;
|
||||||
|
# Disable resolved - conflicts with Unbound resolver
|
||||||
services.resolved.enable = false;
|
services.resolved.enable = false;
|
||||||
networking.nameservers = [
|
networking.nameservers = [
|
||||||
"10.69.13.5"
|
"10.69.13.5"
|
||||||
@@ -43,10 +58,8 @@
|
|||||||
};
|
};
|
||||||
time.timeZone = "Europe/Oslo";
|
time.timeZone = "Europe/Oslo";
|
||||||
|
|
||||||
nix.settings.experimental-features = [
|
|
||||||
"nix-command"
|
nix.settings.tarball-ttl = 0;
|
||||||
"flakes"
|
|
||||||
];
|
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
vim
|
vim
|
||||||
wget
|
wget
|
||||||
@@ -59,5 +72,5 @@
|
|||||||
# Or disable the firewall altogether.
|
# Or disable the firewall altogether.
|
||||||
networking.firewall.enable = false;
|
networking.firewall.enable = false;
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
system.stateVersion = "25.11"; # Did you read the comment?
|
||||||
}
|
}
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
../template/hardware-configuration.nix
|
|
||||||
|
|
||||||
../../system
|
|
||||||
../../services/ns/master-authorative.nix
|
|
||||||
../../services/ns/resolver.nix
|
|
||||||
];
|
|
||||||
|
|
||||||
nixpkgs.config.allowUnfree = true;
|
|
||||||
# Use the systemd-boot EFI boot loader.
|
|
||||||
boot.loader.grub.enable = true;
|
|
||||||
boot.loader.grub.device = "/dev/sda";
|
|
||||||
|
|
||||||
networking.hostName = "ns3";
|
|
||||||
networking.domain = "home.2rjus.net";
|
|
||||||
networking.useNetworkd = true;
|
|
||||||
networking.useDHCP = false;
|
|
||||||
services.resolved.enable = false;
|
|
||||||
networking.nameservers = [
|
|
||||||
"10.69.13.5"
|
|
||||||
"10.69.13.6"
|
|
||||||
];
|
|
||||||
|
|
||||||
systemd.network.enable = true;
|
|
||||||
systemd.network.networks."ens18" = {
|
|
||||||
matchConfig.Name = "ens18";
|
|
||||||
address = [
|
|
||||||
"10.69.13.7/24"
|
|
||||||
];
|
|
||||||
routes = [
|
|
||||||
{ Gateway = "10.69.13.1"; }
|
|
||||||
];
|
|
||||||
linkConfig.RequiredForOnline = "routable";
|
|
||||||
};
|
|
||||||
time.timeZone = "Europe/Oslo";
|
|
||||||
|
|
||||||
nix.settings.experimental-features = [ "nix-command" "flakes" ];
|
|
||||||
environment.systemPackages = with pkgs; [
|
|
||||||
vim
|
|
||||||
wget
|
|
||||||
git
|
|
||||||
];
|
|
||||||
|
|
||||||
# Open ports in the firewall.
|
|
||||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
|
||||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
|
||||||
# Or disable the firewall altogether.
|
|
||||||
networking.firewall.enable = false;
|
|
||||||
|
|
||||||
system.stateVersion = "23.11"; # Did you read the comment?
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
{ config, lib, pkgs, modulesPath, ... }:
|
|
||||||
|
|
||||||
{
|
|
||||||
imports =
|
|
||||||
[
|
|
||||||
(modulesPath + "/profiles/qemu-guest.nix")
|
|
||||||
];
|
|
||||||
|
|
||||||
boot.initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ];
|
|
||||||
boot.initrd.kernelModules = [ ];
|
|
||||||
# boot.kernelModules = [ ];
|
|
||||||
# boot.extraModulePackages = [ ];
|
|
||||||
|
|
||||||
fileSystems."/" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/6889aba9-61ed-4687-ab10-e5cf4017ac8d";
|
|
||||||
fsType = "xfs";
|
|
||||||
};
|
|
||||||
|
|
||||||
fileSystems."/boot" =
|
|
||||||
{
|
|
||||||
device = "/dev/disk/by-uuid/BC07-3B7A";
|
|
||||||
fsType = "vfat";
|
|
||||||
};
|
|
||||||
|
|
||||||
swapDevices =
|
|
||||||
[{ device = "/dev/disk/by-uuid/64e5757b-6625-4dd2-aa2a-66ca93444d23"; }];
|
|
||||||
|
|
||||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
|
||||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
|
||||||
# still possible to use this option, but it's recommended to use it in conjunction
|
|
||||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
|
||||||
# networking.interfaces.ens18.useDHCP = lib.mkDefault true;
|
|
||||||
|
|
||||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
|
||||||
}
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user