From 47747329c43914a7216f1c8b66b82a5183002486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 10 Feb 2026 22:26:40 +0100 Subject: [PATCH 1/3] nix-cache02: add homelab-deploy builder service - Configure builder to build nixos-servers and nixos (gunter) repos - Add builder NKey to Vault secrets - Update NATS permissions for builder, test-deployer, and admin-deployer - Grant nix-cache02 access to shared homelab-deploy secrets Co-Authored-By: Claude Opus 4.5 --- flake.lock | 8 +++--- hosts/nix-cache02/builder.nix | 44 ++++++++++++++++++++++++++++++ hosts/nix-cache02/default.nix | 1 + services/nats/default.nix | 20 ++++++++++++-- terraform/vault/hosts-generated.tf | 1 + terraform/vault/secrets.tf | 5 ++++ terraform/vault/variables.tf | 7 +++++ 7 files changed, 80 insertions(+), 6 deletions(-) create mode 100644 hosts/nix-cache02/builder.nix diff --git a/flake.lock b/flake.lock index 29aacfa..d278e8d 100644 --- a/flake.lock +++ b/flake.lock @@ -28,11 +28,11 @@ ] }, "locked": { - "lastModified": 1770648258, - "narHash": "sha256-sExxD8N9Q0RrHIoppOV6qp4jcJirLVjpQd20C72V78I=", + "lastModified": 1770758165, + "narHash": "sha256-jjCcxhZavm2r7gjZ2+FNOMvTYQsRlIa9ijPICK0HVk4=", "ref": "master", - "rev": "277a49a666347e2e2ae67128cf732956a9c3be56", - "revCount": 27, + "rev": "a8aab16d0e7400aaa00500d08c12734da3b638e0", + "revCount": 32, "type": "git", "url": "https://git.t-juice.club/torjus/homelab-deploy" }, diff --git a/hosts/nix-cache02/builder.nix b/hosts/nix-cache02/builder.nix new file mode 100644 index 0000000..012f3f8 --- /dev/null +++ b/hosts/nix-cache02/builder.nix @@ -0,0 +1,44 @@ +{ config, ... 
}: +{ + # Fetch builder NKey from Vault + vault.secrets.builder-nkey = { + secretPath = "shared/homelab-deploy/builder-nkey"; + extractKey = "nkey"; + outputDir = "/run/secrets/builder-nkey"; + services = [ "homelab-deploy-builder" ]; + }; + + # Configure the builder service + services.homelab-deploy.builder = { + enable = true; + natsUrl = "nats://nats1.home.2rjus.net:4222"; + nkeyFile = "/run/secrets/builder-nkey"; + + settings.repos = { + nixos-servers = { + url = "git+https://git.t-juice.club/torjus/nixos-servers.git"; + defaultBranch = "master"; + }; + nixos = { + url = "git+https://git.t-juice.club/torjus/nixos.git"; + defaultBranch = "master"; + }; + }; + + metrics.enable = true; + }; + + # Expose builder metrics for Prometheus scraping + homelab.monitoring.scrapeTargets = [ + { + job_name = "homelab-deploy-builder"; + port = 9973; + } + ]; + + # Ensure builder starts after vault secret is available + systemd.services.homelab-deploy-builder = { + after = [ "vault-secret-builder-nkey.service" ]; + requires = [ "vault-secret-builder-nkey.service" ]; + }; +} diff --git a/hosts/nix-cache02/default.nix b/hosts/nix-cache02/default.nix index 57ed4b4..d44f078 100644 --- a/hosts/nix-cache02/default.nix +++ b/hosts/nix-cache02/default.nix @@ -1,5 +1,6 @@ { ... 
}: { imports = [ ./configuration.nix + ./builder.nix ]; } \ No newline at end of file diff --git a/services/nats/default.nix b/services/nats/default.nix index bb0a94f..b657f0a 100644 --- a/services/nats/default.nix +++ b/services/nats/default.nix @@ -74,10 +74,12 @@ publish = [ "deploy.test.>" "deploy.discover" + "build.>" ]; subscribe = [ "deploy.responses.>" "deploy.discover" + "build.responses.>" ]; }; } @@ -85,8 +87,22 @@ { nkey = "UD2BFB7DLM67P5UUVCKBUJMCHADIZLGGVUNSRLZE2ZC66FW2XT44P73Y"; permissions = { - publish = [ "deploy.>" ]; - subscribe = [ "deploy.>" ]; + publish = [ + "deploy.>" + "build.>" + ]; + subscribe = [ + "deploy.>" + "build.responses.>" + ]; + }; + } + # Builder (subscribes to build requests, publishes responses) + { + nkey = "UB4PUHGKAWAK6OS62FX7DOQTPFFJTLZZBTKCOCAXDP75H3NSMWAEDJ7E"; + permissions = { + subscribe = [ "build.>" ]; + publish = [ "build.responses.>" ]; }; } ]; diff --git a/terraform/vault/hosts-generated.tf b/terraform/vault/hosts-generated.tf index 96c048c..adc5489 100644 --- a/terraform/vault/hosts-generated.tf +++ b/terraform/vault/hosts-generated.tf @@ -36,6 +36,7 @@ locals { "nix-cache02" = { paths = [ "secret/data/hosts/nix-cache02/*", + "secret/data/shared/homelab-deploy/*", ] } diff --git a/terraform/vault/secrets.tf b/terraform/vault/secrets.tf index 1bb8926..5f2a03e 100644 --- a/terraform/vault/secrets.tf +++ b/terraform/vault/secrets.tf @@ -103,6 +103,11 @@ locals { data = { nkey = var.homelab_deploy_admin_deployer_nkey } } + "shared/homelab-deploy/builder-nkey" = { + auto_generate = false + data = { nkey = var.homelab_deploy_builder_nkey } + } + # Kanidm idm_admin password "kanidm/idm-admin-password" = { auto_generate = true diff --git a/terraform/vault/variables.tf b/terraform/vault/variables.tf index 3a03b66..6cc7b58 100644 --- a/terraform/vault/variables.tf +++ b/terraform/vault/variables.tf @@ -73,6 +73,13 @@ variable "homelab_deploy_admin_deployer_nkey" { sensitive = true } +variable 
"homelab_deploy_builder_nkey" { + description = "NKey seed for homelab-deploy builder" + type = string + default = "PLACEHOLDER" + sensitive = true +} + variable "nixos_exporter_nkey" { description = "NKey seed for nixos-exporter NATS authentication" type = string -- 2.49.1 From f83145d97a137c2ff95df09efd1cab1815d610e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 10 Feb 2026 22:43:48 +0100 Subject: [PATCH 2/3] docs: update nix-cache-reprovision plan with progress - Mark Phase 1 (new build host) and Phase 2 (NATS build triggering) complete - Document nix-cache02 configuration and tested build times - Add remaining work for Harmonia, Actions runner, and DNS cutover - Enable --enable-builds flag in MCP config Co-Authored-By: Claude Opus 4.5 --- .mcp.json | 3 +- docs/plans/nix-cache-reprovision.md | 291 ++++++++++------------------ 2 files changed, 104 insertions(+), 190 deletions(-) diff --git a/.mcp.json b/.mcp.json index 363a82d..f5d61f6 100644 --- a/.mcp.json +++ b/.mcp.json @@ -31,7 +31,8 @@ "--", "mcp", "--nats-url", "nats://nats1.home.2rjus.net:4222", - "--nkey-file", "/home/torjus/.config/homelab-deploy/test-deployer.nkey" + "--nkey-file", "/home/torjus/.config/homelab-deploy/test-deployer.nkey", + "--enable-builds" ] }, "git-explorer": { diff --git a/docs/plans/nix-cache-reprovision.md b/docs/plans/nix-cache-reprovision.md index 74f5394..94e405c 100644 --- a/docs/plans/nix-cache-reprovision.md +++ b/docs/plans/nix-cache-reprovision.md @@ -6,207 +6,120 @@ Reprovision `nix-cache01` using the OpenTofu workflow, and improve the build/cac 1. NATS-based remote build triggering (replacing the current bash script) 2. 
Safer flake update workflow that validates builds before pushing to master +## Status + +**Phase 1: New Build Host** - COMPLETE +**Phase 2: NATS Build Triggering** - COMPLETE +**Phase 3: Safe Flake Update Workflow** - NOT STARTED +**Phase 4: Decommission Old System** - NOT STARTED + +## Completed Work + +### New Build Host (nix-cache02) + +Instead of reprovisioning nix-cache01 in-place, we created a new host `nix-cache02` at 10.69.13.25: + +- **Specs**: 8 CPU cores, 16GB RAM (temporarily, will increase to 24GB after nix-cache01 decommissioned), 200GB disk +- **Provisioned via OpenTofu** with automatic Vault credential bootstrapping +- **Builder service** configured with two repos: + - `nixos-servers` → `git+https://git.t-juice.club/torjus/nixos-servers.git` + - `nixos` (gunter) → `git+https://git.t-juice.club/torjus/nixos.git` + +### NATS-Based Build Triggering + +The `homelab-deploy` tool was extended with a builder mode: + +**NATS Subjects:** +- `build.<repo>.<target>` - e.g., `build.nixos-servers.all` or `build.nixos-servers.ns1` + +**NATS Permissions (in DEPLOY account):** +| User | Publish | Subscribe | +|------|---------|-----------| +| Builder | `build.responses.>` | `build.>` | +| Test deployer | `deploy.test.>`, `deploy.discover`, `build.>` | `deploy.responses.>`, `deploy.discover`, `build.responses.>` | +| Admin deployer | `deploy.>`, `build.>` | `deploy.>`, `build.responses.>` | + +**Vault Secrets:** +- `shared/homelab-deploy/builder-nkey` - NKey seed for builder authentication + +**NixOS Configuration:** +- `hosts/nix-cache02/builder.nix` - Builder service configuration +- `services/nats/default.nix` - Updated with builder NATS user + +**MCP Integration:** +- `.mcp.json` updated with `--enable-builds` flag +- Build tool available via MCP for Claude Code + +**Tested:** +- Single host build: `build nixos-servers testvm01` (~30s) +- All hosts build: `build nixos-servers all` (16 hosts in ~226s) + ## Current State -### Host Configuration -- `nix-cache01` at 10.69.13.15 
serves the binary cache via Harmonia -- Runs Gitea Actions runner for CI workflows -- Has `homelab.deploy.enable = true` (already supports NATS-based deployment) -- Uses a dedicated XFS volume at `/nix` for cache storage +### Old System (nix-cache01) +- Still running at 10.69.13.15 +- Serves binary cache via Harmonia +- Runs Gitea Actions runner +- Has the old `build-flakes.sh` timer (every 30 min) +- Will be decommissioned after nix-cache02 is fully validated -### Current Build System (`services/nix-cache/build-flakes.sh`) -- Runs every 30 minutes via systemd timer -- Clones/pulls two repos: `nixos-servers` and `nixos` (gunter) -- Builds all hosts with `nixos-rebuild build` (no blacklist despite docs mentioning it) -- Pushes success/failure metrics to pushgateway -- Simple but has no filtering, no parallelism, no remote triggering +### New System (nix-cache02) +- Running at 10.69.13.25 +- Builder service active, responding to NATS build requests +- Metrics exposed on port 9973 (`homelab-deploy-builder` job) +- Does NOT yet have: + - Harmonia (binary cache server) + - Actions runner + - Cache signing key -### Current Flake Update Workflow (`.github/workflows/flake-update.yaml`) -- Runs daily at midnight via cron -- Runs `nix flake update --commit-lock-file` -- Pushes directly to master -- No build validation — can push broken inputs +## Remaining Work -## Improvement 1: NATS-Based Remote Build Triggering - -### Design - -Extend the existing `homelab-deploy` tool to support a "build" command that triggers builds on the cache host. This reuses the NATS infrastructure already in place. 
- -| Approach | Pros | Cons | -|----------|------|------| -| Extend homelab-deploy | Reuses existing NATS auth, NKey handling, CLI | Adds scope to existing tool | -| New nix-cache-tool | Clean separation | Duplicate NATS boilerplate, new credentials | -| Gitea Actions webhook | No custom tooling | Less flexible, tied to Gitea | - -**Recommendation:** Extend `homelab-deploy` with a build subcommand. The tool already has NATS client code, authentication handling, and a listener module in NixOS. - -### Implementation - -1. Add new message type to homelab-deploy: `build.<host>` subject -2. Listener on nix-cache01 subscribes to `build.>` wildcard -3. On message receipt, builds the specified host and returns success/failure -4. CLI command: `homelab-deploy build <host>` or `homelab-deploy build --all` - -### Benefits -- Trigger rebuild for specific host to ensure it's cached -- Could be called from CI after merging PRs -- Reuses existing NATS infrastructure and auth -- Progress/status could stream back via NATS reply - -## Improvement 2: Smarter Flake Update Workflow - -### Current Problems -1. Updates can push breaking changes to master -2. No visibility into what broke when it does -3. Hosts that auto-update can pull broken configs - -### Proposed Workflow - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Flake Update Workflow │ -├─────────────────────────────────────────────────────────────────┤ -│ 1. nix flake update (on feature branch) │ -│ 2. Build ALL hosts locally │ -│ 3. If all pass → fast-forward merge to master │ -│ 4. 
If any fail → create PR with failure logs attached │ -└─────────────────────────────────────────────────────────────────┘ -``` - -### Implementation Options - -| Option | Description | Pros | Cons | -|--------|-------------|------|------| -| **A: Self-hosted runner** | Build on nix-cache01 | Fast (local cache), simple | Ties up cache host during build | -| **B: Gitea Actions only** | Use container runner | Clean separation | Slow (no cache), resource limits | -| **C: Hybrid** | Trigger builds on nix-cache01 via NATS from Actions | Best of both | More complex | - -**Recommendation:** Option A with nix-cache01 as the runner. The host is already running Gitea Actions runner and has the cache. Building all ~16 hosts is disk I/O heavy but feasible on dedicated hardware. - -### Workflow Steps - -1. Workflow runs on schedule (daily or weekly) -2. Creates branch `flake-update/YYYY-MM-DD` -3. Runs `nix flake update --commit-lock-file` -4. Builds each host: `nix build .#nixosConfigurations.<host>.config.system.build.toplevel` -5. If all succeed: - - Fast-forward merge to master - - Delete feature branch -6. 
If any fail: - - Create PR from the update branch - - Attach build logs as PR comment - - Label PR with `needs-review` or `build-failure` - - Do NOT merge automatically - -### Workflow File Changes - -```yaml -# New: .github/workflows/flake-update-safe.yaml -name: Safe flake update -on: - schedule: - - cron: "0 2 * * 0" # Weekly on Sunday at 2 AM - workflow_dispatch: # Manual trigger - -jobs: - update-and-validate: - runs-on: homelab # Use self-hosted runner on nix-cache01 - steps: - - uses: actions/checkout@v4 - with: - ref: master - fetch-depth: 0 # Need full history for merge - - - name: Create update branch - run: | - BRANCH="flake-update/$(date +%Y-%m-%d)" - git checkout -b "$BRANCH" - - - name: Update flake - run: nix flake update --commit-lock-file - - - name: Build all hosts - id: build - run: | - FAILED="" - for host in $(nix flake show --json | jq -r '.nixosConfigurations | keys[]'); do - echo "Building $host..." - if ! nix build ".#nixosConfigurations.$host.config.system.build.toplevel" 2>&1 | tee "build-$host.log"; then - FAILED="$FAILED $host" - fi - done - echo "failed=$FAILED" >> $GITHUB_OUTPUT - - - name: Merge to master (if all pass) - if: steps.build.outputs.failed == '' - run: | - git checkout master - git merge --ff-only "$BRANCH" - git push origin master - git push origin --delete "$BRANCH" - - - name: Create PR (if any fail) - if: steps.build.outputs.failed != '' - run: | - git push origin "$BRANCH" - # Create PR via Gitea API with build logs - # ... (PR creation with log attachment) -``` - -## Migration Steps - -### Phase 1: Reprovision Host via OpenTofu - -1. Add `nix-cache01` to `terraform/vms.tf`: - ```hcl - "nix-cache01" = { - ip = "10.69.13.15/24" - cpu_cores = 4 - memory = 8192 - disk_size = "100G" # Larger for nix store - } - ``` - -2. Shut down existing nix-cache01 VM -3. Run `tofu apply` to provision new VM -4. Verify bootstrap completes and cache is serving - -**Note:** The cache will be cold after reprovision. 
Run initial builds to populate. - -### Phase 2: Add Build Triggering to homelab-deploy - -1. Add `build` command to homelab-deploy CLI -2. Add listener handler in NixOS module for `build.*` subjects -3. Update nix-cache01 config to enable build listener -4. Test with `homelab-deploy build testvm01` - -### Phase 3: Implement Safe Flake Update Workflow +### Phase 3: Safe Flake Update Workflow 1. Create `.github/workflows/flake-update-safe.yaml` 2. Disable or remove old `flake-update.yaml` 3. Test manually with `workflow_dispatch` 4. Monitor first automated run -### Phase 4: Remove Old Build Script +### Phase 4: Complete Migration -1. After new workflow is stable, remove: - - `services/nix-cache/build-flakes.nix` - - `services/nix-cache/build-flakes.sh` -2. The new workflow handles scheduled builds +1. **Add Harmonia to nix-cache02** - Copy cache signing key, configure service +2. **Add Actions runner to nix-cache02** - Configure with Vault token +3. **Update DNS** - Point `nix-cache.home.2rjus.net` to nix-cache02 +4. **Increase RAM** - Bump to 24GB after nix-cache01 is gone +5. **Decommission nix-cache01**: + - Remove from `terraform/vms.tf` + - Remove old build script (`services/nix-cache/build-flakes.nix`, `build-flakes.sh`) + - Archive or delete host config + +### Phase 5: Scheduled Builds (Optional) + +Add a systemd timer on nix-cache02 to trigger periodic builds via NATS: + +```nix +systemd.services.scheduled-build = { + script = '' + homelab-deploy build nixos-servers --all + homelab-deploy build nixos --all + ''; +}; +systemd.timers.scheduled-build = { + wantedBy = [ "timers.target" ]; + timerConfig.OnCalendar = "*-*-* *:30:00"; +}; +``` + +Or trigger builds from CI after merges to master. 
+ +## Resolved Questions + +- **Parallel vs sequential builds?** Sequential - hosts share packages, subsequent builds are fast after first +- **What about gunter?** Configured as `nixos` repo in builder settings +- **Disk size?** 200GB for new host +- **Build host specs?** 8 cores, 16-24GB RAM matches current nix-cache01 ## Open Questions -- [ ] What runner labels should the self-hosted runner use for the update workflow? -- [ ] Should we build hosts in parallel (faster) or sequentially (easier to debug)? -- [ ] How long to keep flake-update PRs open before auto-closing stale ones? -- [ ] Should successful updates trigger a NATS notification to rebuild all hosts? -- [ ] What to do about `gunter` (external nixos repo) - include in validation? -- [ ] Disk size for new nix-cache01 - is 100G enough for cache + builds? - -## Notes - -- The existing `homelab.deploy.enable = true` on nix-cache01 means it already has NATS connectivity -- The Harmonia service and cache signing key will work the same after reprovision -- Actions runner token is in Vault, will be provisioned automatically -- Consider adding a `homelab.host.role = "build-host"` label for monitoring/filtering +- [ ] When to cut over DNS from nix-cache01 to nix-cache02? +- [ ] Keep Actions runner on nix-cache02 or separate host? +- [ ] Implement safe flake update workflow before or after full migration? 
-- 2.49.1 From 5bfb51a4976211190fda025626df2574a8bf8df2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 10 Feb 2026 22:46:38 +0100 Subject: [PATCH 3/3] docs: add observability phase to nix-cache plan - Add Phase 6 for alerting and Grafana dashboards - Document available Prometheus metrics - Include example alerting rules for build failures Co-Authored-By: Claude Opus 4.5 --- docs/plans/nix-cache-reprovision.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/plans/nix-cache-reprovision.md b/docs/plans/nix-cache-reprovision.md index 94e405c..2bb57ef 100644 --- a/docs/plans/nix-cache-reprovision.md +++ b/docs/plans/nix-cache-reprovision.md @@ -118,6 +118,30 @@ Or trigger builds from CI after merges to master. - **Disk size?** 200GB for new host - **Build host specs?** 8 cores, 16-24GB RAM matches current nix-cache01 +### Phase 6: Observability + +1. **Alerting rules** for build failures: + ```promql + # Alert if any build fails + increase(homelab_deploy_build_host_total{status="failure"}[1h]) > 0 + + # Alert if no successful builds in 24h (scheduled builds stopped) + time() - homelab_deploy_build_last_success_timestamp > 86400 + ``` + +2. **Grafana dashboard** for build metrics: + - Build success/failure rate over time + - Average build duration per host (histogram) + - Build frequency (builds per hour/day) + - Last successful build timestamp per repo + +Available metrics: +- `homelab_deploy_builds_total{repo, status}` - total builds by repo and status +- `homelab_deploy_build_host_total{repo, host, status}` - per-host build counts +- `homelab_deploy_build_duration_seconds_{bucket,sum,count}` - build duration histogram +- `homelab_deploy_build_last_timestamp{repo}` - last build attempt +- `homelab_deploy_build_last_success_timestamp{repo}` - last successful build + ## Open Questions - [ ] When to cut over DNS from nix-cache01 to nix-cache02? -- 2.49.1