From cf19ade34bfb6ffbfd75e1b85a63512d065d918a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Thu, 12 Mar 2026 23:15:51 +0100 Subject: [PATCH 1/2] nix-cache02: add native nix forgejo runner instance Add a second runner instance (actions-native) that executes jobs directly on the host, giving workflows persistent nix store access and automatic binary cache population via Harmonia. Co-Authored-By: Claude Opus 4.6 --- docs/plans/bare-metal-actions-runner.md | 155 -------------------- docs/plans/native-forgejo-runner.md | 181 ++++++++++++++++++++++++ hosts/nix-cache02/actions-runner.nix | 37 ++++- terraform/vault/secrets.tf | 7 +- terraform/vault/variables.tf | 7 + 5 files changed, 230 insertions(+), 157 deletions(-) delete mode 100644 docs/plans/bare-metal-actions-runner.md create mode 100644 docs/plans/native-forgejo-runner.md diff --git a/docs/plans/bare-metal-actions-runner.md b/docs/plans/bare-metal-actions-runner.md deleted file mode 100644 index 1b3cdcd..0000000 --- a/docs/plans/bare-metal-actions-runner.md +++ /dev/null @@ -1,155 +0,0 @@ -# Bare Metal Forgejo Actions Runner on nix-cache02 - -## Goal - -Add a second Forgejo Actions runner instance on nix-cache02 that executes jobs directly on the host (bare metal). This allows CI builds to populate the nix binary cache automatically, reducing reliance on manually triggered builds before deployments. - -## Motivation - -Currently the workflow for updating a flake input (e.g. nixos-exporter) is: - -1. Update flake lock -2. Push to master -3. Manually trigger a build on nix-cache02 (or wait for the scheduled builder) -4. Deploy to hosts - -With a bare metal runner, repos like nixos-exporter can have CI workflows that run `nix build`, and those derivations automatically end up in the cache (served by harmonia). By the time hosts auto-upgrade, everything is already cached. - -## Design - -### Two Runner Instances - -- **actions1** (existing) — Container-based, available to all Forgejo repos. 
Unchanged. -- **actions2** (new) — Host-based, restricted to trusted repos only via Forgejo runner scoping. - -### Trusted Repos - -Repos that should be allowed to use the bare metal runner: - -- `torjus/nixos-servers` -- `torjus/nixos-exporter` -- `torjus/nixos` (gunter/magicman configs) -- Other repos with nix builds that benefit from cache population (add as needed) - -Restriction is configured in the Forgejo web UI when registering the runner — scope it to specific repos or the org. - -### Label Configuration - -The new instance would use a host label: - -```nix -labels = [ "native:host" ]; -``` - -Workflow files in trusted repos would target this with `runs-on: native`. - -### Host Packages - -The runner needs nix and basic tools available: - -```nix -hostPackages = with pkgs; [ - bash - coreutils - curl - gawk - gitMinimal - gnused - nodejs - wget - nix -]; -``` - -## Security Analysis - -### What the runner CAN access - -- **Nix store** — Can read and write derivations. This is the whole point; harmonia serves the store to all hosts. -- **Network** — Full network access during job execution. -- **World-readable files** — Standard for any process on the system. - -### What the runner CANNOT access - -- **Cache signing key** — `/run/secrets/cache-secret` is mode `0400` root-owned. Harmonia signs derivations on serve, not on store write. -- **Vault AppRole credentials** — `/var/lib/vault/approle/` is root-owned. -- **Other vault secrets** — All in `/run/secrets/` with restrictive permissions. - -### Mitigations - -- **Trusted repos only** — Forgejo runner scoping restricts which repos can submit jobs. Only repos we control should have access. -- **DynamicUser** — The runner uses systemd DynamicUser, so no persistent user account. Each invocation gets an ephemeral UID. -- **Separate instance** — Container-based jobs (untrusted repos) remain on actions1 and never get host access. 
- -### Accepted Risks - -- A compromised trusted repo could inject bad derivations into the nix store/cache. This is an accepted risk since those repos already have deploy access to production hosts. -- Jobs can consume host resources (CPU, memory, disk). The `runner.capacity` setting limits concurrent jobs. - -## Implementation - -### 1. NixOS Configuration - -**File:** `hosts/nix-cache02/actions-runner.nix` - -Add a second instance alongside the existing overrides: - -```nix -{ pkgs, ... }: -{ - # ... existing actions1 overrides ... - - services.gitea-actions-runner.instances.actions2 = { - enable = true; - name = "nix-cache02-native"; - url = "https://code.t-juice.club"; - tokenFile = "/run/secrets/forgejo-runner-token-native"; - labels = [ "native:host" ]; - hostPackages = with pkgs; [ - bash coreutils curl gawk gitMinimal gnused nodejs wget nix - ]; - settings = { - runner.capacity = 4; - cache = { - enabled = true; - dir = "/var/lib/gitea-runner/actions2/cache"; - }; - }; - }; -} -``` - -### 2. Vault Secret - -The native runner needs its own registration token (separate from actions1): - -- Add `hosts/nix-cache02/forgejo-runner-token-native` to `terraform/vault/secrets.tf` -- Add `forgejo_runner_token_native` variable to `terraform/vault/variables.tf` -- Add vault secret config in `actions-runner.nix` pointing to the new path - -### 3. Forgejo Setup - -1. Generate a new runner token in Forgejo, scoped to trusted repos only -2. Store in Vault: `bao kv put secret/hosts/nix-cache02/forgejo-runner-token-native token=` -3. Set the tfvar and run `tofu apply` in `terraform/vault/` - -### 4. Example Workflow - -In a trusted repo (e.g. nixos-exporter): - -```yaml -name: Build -on: [push] -jobs: - build: - runs-on: native - steps: - - uses: actions/checkout@v4 - - run: nix build -``` - -## Open Questions - -- Should `hostPackages` include additional tools (e.g. `cachix`, `nix-prefetch-*`)? -- Should we set resource limits on the runner (systemd MemoryMax, CPUQuota)? 
-- Do we want a separate capacity for the native runner vs container runner, or is 4 fine for both? diff --git a/docs/plans/native-forgejo-runner.md b/docs/plans/native-forgejo-runner.md new file mode 100644 index 0000000..5d48282 --- /dev/null +++ b/docs/plans/native-forgejo-runner.md @@ -0,0 +1,181 @@ +# Native Nix Forgejo Runner on nix-cache02 + +## Goal + +Add a second Forgejo Actions runner instance on nix-cache02 that executes jobs directly on the host (no containers). This allows CI builds to populate the nix binary cache automatically, reducing reliance on manually triggered builds before deployments. + +## Motivation + +- **Nix store caching**: The container-based `nix` label runs in ephemeral Podman containers, losing all nix store paths between jobs. Native execution uses the host's persistent store, so builds reuse cached paths automatically. +- **Binary cache integration**: nix-cache02 *is* the binary cache server (Harmonia). Paths built by CI are immediately available to all hosts. +- **Faster deploy cycle**: Currently updating a flake input (e.g. nixos-exporter) requires pushing to master, then waiting for the scheduled builder or manually triggering a build. With a native runner, repos can have CI workflows that run `nix build`, and those derivations are in the cache by the time hosts auto-upgrade. +- **NixOS config builds**: Enables future workflows that build `nixosConfigurations.*` from this repo, populating the cache as a side effect of CI. + +## Design + +### Two Runner Instances + +- **actions1** (existing) — Container-based, global runner available to all Forgejo repos. Unchanged. +- **actions-native** (new) — Host-based, registered as a user-level runner under the `torjus` Forgejo account, so only repos owned by that user can target it. 
+ +### Trusted Repos + +Repos that should be allowed to use the native runner: + +- `torjus/nixos-servers` +- `torjus/nixos-exporter` +- `torjus/nixos` (gunter/magicman configs) +- Other repos with nix builds that benefit from cache population (add as needed) + +Restriction is configured in the Forgejo web UI when registering the runner — scope it to the user or specific repos. + +### Label Configuration + +```nix +labels = [ "native-nix:host" ]; +``` + +Workflow files in trusted repos target this with `runs-on: native-nix`. + +### Host Packages + +The runner needs nix and basic tools available on the host: + +```nix +hostPackages = with pkgs; [ + bash + coreutils + curl + gawk + git + gnused + nodejs + wget + nix +]; +``` + +## Security Analysis + +### What the runner CAN access + +- **Nix store** — Can read and write derivations. This is the whole point; harmonia serves the store to all hosts. +- **Network** — Full network access during job execution. +- **World-readable files** — Standard for any process on the system. This includes the native runner's own registration token at `/run/secrets/forgejo-native-runner-token`, which is provisioned with mode `0444`. + +### What the runner CANNOT access + +- **Cache signing key** — `/run/secrets/cache-secret` is mode `0400` root-owned. Harmonia signs derivations on serve, not on store write. +- **Vault AppRole credentials** — `/var/lib/vault/approle/` is root-owned. +- **Other vault secrets** — All other secrets in `/run/secrets/` that are provisioned with restrictive (non-world-readable) permissions. + +### Mitigations + +- **User-level runner** — Registered to the `torjus` user on Forgejo (not global), so only repos owned by that user can submit jobs. +- **DynamicUser** — The runner uses systemd DynamicUser, so no persistent user account. Each invocation gets an ephemeral UID. +- **Nix sandbox** — Nix builds already run sandboxed by default. Non-nix `run:` steps execute as the runner's system user but have no special privileges. +- **Separate instance** — Container-based jobs (untrusted repos) remain on actions1 and never get host access. 
+ +### Accepted Risks + +- A compromised trusted repo could inject bad derivations into the nix store/cache. This is an accepted risk since those repos already have deploy access to production hosts. +- Jobs can consume host resources (CPU, memory, disk). The `runner.capacity` setting limits concurrent jobs. + +## Implementation + +### 1. Register runner on Forgejo and store token in Vault + +- In Forgejo web UI: go to user settings > Actions > Runners, create a new runner registration token. +- Store the token in Vault via Terraform. + +**terraform/vault/variables.tf** — add variable: +```hcl +variable "forgejo_native_runner_token" { + description = "Forgejo Actions runner token for native nix runner on nix-cache02" + type = string + default = "PLACEHOLDER" + sensitive = true +} +``` + +**terraform/vault/secrets.tf** — add secret: +```hcl +"hosts/nix-cache02/forgejo-native-runner-token" = { + auto_generate = false + data = { token = var.forgejo_native_runner_token } +} +``` + +### 2. Add NixOS configuration for native runner instance + +Note: nix-cache02 already has an AppRole with access to `secret/data/hosts/nix-cache02/*` (defined in `terraform/vault/hosts-generated.tf`), so no approle changes are needed. 
+ +**File:** `hosts/nix-cache02/actions-runner.nix` + +Add vault secret and runner instance alongside the existing overrides: + +```nix +# Fetch native runner token from Vault +vault.secrets.forgejo-native-runner-token = { + secretPath = "hosts/nix-cache02/forgejo-native-runner-token"; + extractKey = "token"; + mode = "0444"; + services = [ "gitea-runner-actions-native" ]; +}; + +# Native nix runner instance +services.gitea-actions-runner.instances.actions-native = { + enable = true; + name = "${config.networking.hostName}-native"; + url = "https://code.t-juice.club"; + tokenFile = "/run/secrets/forgejo-native-runner-token"; + labels = [ "native-nix:host" ]; + hostPackages = with pkgs; [ + bash coreutils curl gawk git gnused nodejs wget nix + ]; + settings = { + runner.capacity = 4; + cache = { + enabled = true; + dir = "/var/lib/gitea-runner/actions-native/cache"; + }; + }; +}; +``` + +### 3. Build and deploy + +1. Create feature branch +2. Make the Terraform changes (variables + secrets; no AppRole policy change is needed) +3. Set the actual token value in `terraform.tfvars` +4. Run `tofu apply` in `terraform/vault/` +5. Build the NixOS configuration: `nix build .#nixosConfigurations.nix-cache02.config.system.build.toplevel` +6. Deploy to nix-cache02 +7. Verify the native runner appears as online in Forgejo UI + +### 4. Test with a workflow + +In a trusted repo (e.g. nixos-exporter): + +```yaml +name: Build +on: [push] +jobs: + build: + runs-on: native-nix + steps: + - uses: actions/checkout@v4 + - run: nix build +``` + +## Future Work + +- **NixOS config CI**: Workflow that builds all `nixosConfigurations` on push to master, populating the binary cache. +- **Nix store GC policy**: CI builds will accumulate store paths. Since this host is the binary cache, GC needs to be conservative — only delete paths not referenced by current system configurations. Defer to a follow-up. 
+- **Resource limits**: Consider systemd MemoryMax/CPUQuota on the native runner if resource contention becomes an issue. +- **Additional host packages**: Evaluate whether tools like `cachix` or `nix-prefetch-*` should be added. + +## Open Questions + +- Should `hostPackages` include additional tools beyond the basics listed above? +- Do we want a separate capacity for the native runner vs container runner, or is 4 fine for both? diff --git a/hosts/nix-cache02/actions-runner.nix b/hosts/nix-cache02/actions-runner.nix index fcac442..64205b8 100644 --- a/hosts/nix-cache02/actions-runner.nix +++ b/hosts/nix-cache02/actions-runner.nix @@ -1,4 +1,4 @@ -{ ... }: +{ config, pkgs, ... }: { # Fetch runner token from Vault vault.secrets.forgejo-runner-token = { @@ -13,4 +13,39 @@ tokenFile = "/run/secrets/forgejo-runner-token"; settings.runner.capacity = 4; }; + + # Fetch native runner token from Vault + vault.secrets.forgejo-native-runner-token = { + secretPath = "hosts/nix-cache02/forgejo-native-runner-token"; + extractKey = "token"; + mode = "0444"; + services = [ "gitea-runner-actions-native" ]; + }; + + # Native nix runner instance (user-level, no containers) + services.gitea-actions-runner.instances.actions-native = { + enable = true; + name = "${config.networking.hostName}-native"; + url = "https://code.t-juice.club"; + tokenFile = "/run/secrets/forgejo-native-runner-token"; + labels = [ "native-nix:host" ]; + hostPackages = with pkgs; [ + bash + coreutils + curl + gawk + git + gnused + nodejs + wget + nix + ]; + settings = { + runner.capacity = 4; + cache = { + enabled = true; + dir = "/var/lib/gitea-runner/actions-native/cache"; + }; + }; + }; } diff --git a/terraform/vault/secrets.tf b/terraform/vault/secrets.tf index 2ad5abf..cea7c62 100644 --- a/terraform/vault/secrets.tf +++ b/terraform/vault/secrets.tf @@ -145,12 +145,17 @@ locals { password_length = 64 } - # Forgejo runner token for nix-cache02 + # Forgejo runner tokens for nix-cache02 
"hosts/nix-cache02/forgejo-runner-token" = { auto_generate = false data = { token = var.forgejo_runner_token } } + "hosts/nix-cache02/forgejo-native-runner-token" = { + auto_generate = false + data = { token = var.forgejo_native_runner_token } + } + # Loki push authentication (used by Promtail on all hosts) "shared/loki/push-auth" = { auto_generate = true diff --git a/terraform/vault/variables.tf b/terraform/vault/variables.tf index 2c23774..899385d 100644 --- a/terraform/vault/variables.tf +++ b/terraform/vault/variables.tf @@ -116,3 +116,10 @@ variable "forgejo_runner_token" { sensitive = true } +variable "forgejo_native_runner_token" { + description = "Forgejo Actions runner token for native nix runner on nix-cache02" + type = string + default = "PLACEHOLDER" + sensitive = true +} + -- 2.49.1 From 3cb5148c40d649b7fc703c055bfd181fb16582af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Thu, 12 Mar 2026 23:25:01 +0100 Subject: [PATCH 2/2] docs: move native forgejo runner plan to completed Co-Authored-By: Claude Opus 4.6 --- docs/plans/{ => completed}/native-forgejo-runner.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/plans/{ => completed}/native-forgejo-runner.md (100%) diff --git a/docs/plans/native-forgejo-runner.md b/docs/plans/completed/native-forgejo-runner.md similarity index 100% rename from docs/plans/native-forgejo-runner.md rename to docs/plans/completed/native-forgejo-runner.md -- 2.49.1