From 737bb162c998ba02f4421ecf194dd28f84492f2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 03:07:30 +0100 Subject: [PATCH] chore: initial commit with scaffolding and design doc --- design.md | 505 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ flake.nix | 15 ++ go.mod | 3 + 3 files changed, 523 insertions(+) create mode 100644 design.md create mode 100644 flake.nix create mode 100644 go.mod diff --git a/design.md b/design.md new file mode 100644 index 0000000..33e0dbb --- /dev/null +++ b/design.md @@ -0,0 +1,505 @@ +# homelab-deploy Design Document + +A message-based deployment system for NixOS configurations using NATS for messaging. This binary runs in multiple modes to enable on-demand NixOS configuration updates across a fleet of hosts. + +## Overview + +The `homelab-deploy` binary provides three operational modes: + +1. **Listener mode** - Runs on each NixOS host as a systemd service, subscribing to NATS subjects and executing `nixos-rebuild` when deployment requests arrive +2. **MCP mode** - Runs as an MCP (Model Context Protocol) server, exposing deployment tools for AI assistants +3. 
**CLI mode** - Manual deployment commands for administrators + +## Architecture + +``` +┌─────────────┐ ┌─────────────┐ +│ MCP Tool │ deploy.test.> │ Admin CLI │ deploy.test.> + deploy.prod.> +│ │────────────┐ ┌─────│ │ +└─────────────┘ │ │ └─────────────┘ + ▼ ▼ + ┌──────────────┐ + │ NATS Server │ + │ (authz) │ + └──────┬───────┘ + │ + ┌─────────────────┼─────────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ host-a │ │ host-b │ │ host-c │ + │ tier=test│ │ tier=prod│ │ tier=prod│ + └──────────┘ └──────────┘ └──────────┘ +``` + +## Repository Structure + +``` +homelab-deploy/ +├── flake.nix # Nix flake with Go package + NixOS module +├── go.mod +├── go.sum +├── cmd/ +│ └── homelab-deploy/ +│ └── main.go # CLI entrypoint with subcommands +├── internal/ +│ ├── listener/ # Listener mode logic +│ ├── mcp/ # MCP server mode logic +│ ├── nats/ # NATS client wrapper +│ └── deploy/ # Shared deployment execution logic +└── nixos/ + └── module.nix # NixOS module for listener service +``` + +## CLI Interface + +```bash +# Listener mode (runs as systemd service on each host) +homelab-deploy listener \ + --hostname <hostname> \ + --tier <test|prod> \ + --nats-url nats://server:4222 \ + --nkey-file /path/to/listener.nkey \ + --flake-url <flake-url> \ + [--role <role>] \ + [--timeout 600] + +# MCP server mode (for AI assistants) +homelab-deploy mcp \ + --nats-url nats://server:4222 \ + --nkey-file /path/to/mcp.nkey \ + [--enable-admin --admin-nkey-file /path/to/admin.nkey] + +# CLI commands for manual use +homelab-deploy deploy <hostname> \ + --nats-url nats://server:4222 \ + --nkey-file /path/to/deployer.nkey \ + [--branch <branch>] \ + [--action <switch|boot|test|dry-activate>] + +homelab-deploy deploy \ + --tier <test|prod> \ + --all \ + --nats-url nats://server:4222 \ + --nkey-file /path/to/deployer.nkey \ + [--branch <branch>] \ + [--action <switch|boot|test|dry-activate>] + +homelab-deploy deploy \ + --tier <test|prod> \ + --role <role> \ + --nats-url nats://server:4222 \ + --nkey-file /path/to/deployer.nkey \ + [--branch <branch>] \ + [--action <switch|boot|test|dry-activate>] +``` + +## NATS Subject Structure + +Subjects follow the pattern 
`deploy.<tier>.<target>`: + +| Subject Pattern | Description | +|-----------------|-------------| +| `deploy.<tier>.<hostname>` | Deploy to specific host (e.g., `deploy.prod.ns1`) | +| `deploy.<tier>.all` | Deploy to all hosts in tier (e.g., `deploy.test.all`) | +| `deploy.<tier>.role.<role>` | Deploy to hosts with role in tier (e.g., `deploy.prod.role.dns`) | +| `deploy.responses.<request-id>` | Response subject for request/reply pattern | + +## Listener Mode + +### Responsibilities + +1. Connect to NATS using NKey authentication +2. Subscribe to subjects based on hostname, tier, and role +3. Validate incoming deployment requests +4. Execute `nixos-rebuild` with the specified parameters +5. Report status back via NATS reply subject + +### Subject Subscriptions + +A listener subscribes to multiple subjects based on its configuration: + +- `deploy.<tier>.<hostname>` - Direct messages to this host +- `deploy.<tier>.all` - Broadcast to all hosts in tier +- `deploy.<tier>.role.<role>` - Broadcast to hosts with matching role (only if role is configured) + +**Example:** A host with `hostname=ns1, tier=prod, role=dns` subscribes to: +- `deploy.prod.ns1` +- `deploy.prod.all` +- `deploy.prod.role.dns` + +### Message Formats + +**Request message:** +```json +{ + "action": "switch", + "revision": "master", + "reply_to": "deploy.responses.abc123" +} +``` + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `action` | string | yes | One of: `switch`, `boot`, `test`, `dry-activate` | +| `revision` | string | yes | Git branch name or commit hash | +| `reply_to` | string | yes | Subject to publish responses to | + +**Response message:** +```json +{ + "hostname": "ns1", + "status": "completed", + "error": null, + "message": "Successfully switched to generation 42" +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `hostname` | string | The responding host's name | +| `status` | string | One of: `accepted`, `rejected`, `started`, `completed`, `failed` | +| `error` | string or null | Error code if status is 
`rejected` or `failed` | +| `message` | string | Human-readable details | + +**Error codes:** +- `invalid_revision` - The specified branch/commit does not exist +- `invalid_action` - The action is not recognized +- `already_running` - A deployment is already in progress on this host +- `build_failed` - nixos-rebuild exited with non-zero status +- `timeout` - Deployment exceeded the configured timeout + +### Request/Reply Flow + +1. Deployer generates a unique `reply_to` subject and subscribes to it +2. Deployer publishes the request carrying that `reply_to` subject +3. Listener validates request: + - Checks revision exists using `git ls-remote` + - Checks no other deployment is running +4. Listener sends immediate response: + - `{"status": "rejected", ...}` if validation fails, or + - `{"status": "started", ...}` if deployment begins +5. If started, listener executes nixos-rebuild +6. Listener sends final response: + - `{"status": "completed", ...}` on success, or + - `{"status": "failed", ...}` on failure + +### Deployment Execution + +The listener executes `nixos-rebuild` with the following command pattern: + +```bash +nixos-rebuild <action> --flake <flake-url>?ref=<revision>#<hostname> +``` + +Where: +- `<action>` is one of: `switch`, `boot`, `test`, `dry-activate` +- `<flake-url>` is the configured git flake URL (e.g., `git+https://git.example.com/user/nixos-configs.git`) +- `<revision>` is the branch name or commit hash from the request +- `<hostname>` is the listener's configured hostname + +**Environment requirements:** +- Must run as root (nixos-rebuild requires root) +- Nix must be configured with proper git credentials if the flake is private +- Network access to the git repository + +### Concurrency Control + +Only one deployment may run at a time per host. 
The listener maintains a simple lock: +- Before starting a deployment, acquire lock +- If lock is held, reject with `already_running` error +- Release lock when deployment completes (success or failure) +- Lock should be in-memory (no persistence needed - restarts clear it) + +### Logging + +All deployment events should be logged to stdout/stderr (captured by systemd journal): +- Request received (with subject, action, revision) +- Validation result +- Deployment start +- Deployment completion (with exit code) +- Any errors + +This enables integration with log aggregation systems (e.g., Loki via Promtail). + +## MCP Mode + +### Purpose + +Exposes deployment functionality as MCP tools for AI assistants (e.g., Claude Code). + +### Tools + +| Tool | Description | Parameters | +|------|-------------|------------| +| `deploy` | Deploy to test-tier hosts | `hostname` or `all`, optional `role`, `branch`, `action` | +| `deploy_admin` | Deploy to any tier (requires `--enable-admin`) | `tier`, `hostname` or `all`, optional `role`, `branch`, `action` | +| `list_hosts` | List available deployment targets | `tier` (optional) | + +### Tool Schemas + +**deploy:** +```json +{ + "name": "deploy", + "description": "Deploy NixOS configuration to test-tier hosts", + "inputSchema": { + "type": "object", + "properties": { + "hostname": { + "type": "string", + "description": "Target hostname, or omit to use 'all' or 'role' targeting" + }, + "all": { + "type": "boolean", + "description": "Deploy to all test-tier hosts" + }, + "role": { + "type": "string", + "description": "Deploy to all test-tier hosts with this role" + }, + "branch": { + "type": "string", + "description": "Git branch or commit to deploy (default: master)" + }, + "action": { + "type": "string", + "enum": ["switch", "boot", "test", "dry-activate"], + "description": "nixos-rebuild action (default: switch)" + } + } + } +} +``` + +**deploy_admin:** +```json +{ + "name": "deploy_admin", + "description": "Deploy NixOS 
configuration to any host (admin access required)", + "inputSchema": { + "type": "object", + "properties": { + "tier": { + "type": "string", + "enum": ["test", "prod"], + "description": "Target tier" + }, + "hostname": { + "type": "string", + "description": "Target hostname, or omit to use 'all' or 'role' targeting" + }, + "all": { + "type": "boolean", + "description": "Deploy to all hosts in tier" + }, + "role": { + "type": "string", + "description": "Deploy to all hosts with this role in tier" + }, + "branch": { + "type": "string", + "description": "Git branch or commit to deploy (default: master)" + }, + "action": { + "type": "string", + "enum": ["switch", "boot", "test", "dry-activate"], + "description": "nixos-rebuild action (default: switch)" + } + }, + "required": ["tier"] + } +} +``` + +**list_hosts:** +```json +{ + "name": "list_hosts", + "description": "List available deployment targets", + "inputSchema": { + "type": "object", + "properties": { + "tier": { + "type": "string", + "enum": ["test", "prod"], + "description": "Filter by tier (optional)" + } + } + } +} +``` + +### Security Layers + +1. **MCP flag**: `deploy_admin` tool only registered when `--enable-admin` is passed +2. **NATS authz**: MCP credentials can only publish to authorized subjects +3. **AI assistant permissions**: The assistant's configuration can require confirmation for admin operations + +### Multi-Host Deployments + +When deploying to multiple hosts (via `all` or `role`), the MCP should: +1. Publish the request to the appropriate broadcast subject +2. Collect responses from all responding hosts +3. Return aggregated results showing each host's status + +**Timeout handling:** +- Set a reasonable timeout for collecting responses (e.g., 30 seconds after last response, or max 15 minutes) +- Return partial results if some hosts don't respond +- Indicate which hosts did not respond + +### Host Discovery + +The `list_hosts` tool needs to know available hosts. Options: +1. 
**Static configuration**: Read from a config file or environment variable +2. **NATS request**: Publish to a discovery subject and collect responses from listeners + +Recommend option 2: Listeners respond to `deploy.discover` with their metadata: +```json +{ + "hostname": "ns1", + "tier": "prod", + "role": "dns" +} +``` + +## NixOS Module + +The NixOS module configures the listener as a systemd service. + +### Module Options + +```nix +{ + options.services.homelab-deploy.listener = { + enable = lib.mkEnableOption "homelab-deploy listener service"; + + hostname = lib.mkOption { + type = lib.types.str; + description = "Hostname for this listener (used for NATS subject)"; + }; + + tier = lib.mkOption { + type = lib.types.enum [ "test" "prod" ]; + description = "Deployment tier for this host"; + }; + + role = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Role for role-based deployment targeting"; + }; + + natsUrl = lib.mkOption { + type = lib.types.str; + description = "NATS server URL"; + }; + + nkeyFile = lib.mkOption { + type = lib.types.path; + description = "Path to NKey seed file for NATS authentication"; + }; + + flakeUrl = lib.mkOption { + type = lib.types.str; + description = "Git flake URL for nixos-rebuild"; + }; + + timeout = lib.mkOption { + type = lib.types.int; + default = 600; + description = "Deployment timeout in seconds"; + }; + }; +} +``` + +### Systemd Service + +The module should create a systemd service with: +- `Type=simple` +- `Restart=always` +- `RestartSec=10` +- Run as root (required for nixos-rebuild) +- Proper ordering (after network-online.target) +- Resource limits if desired + +## NATS Authentication + +All NATS connections use NKey authentication. 
NKeys are ed25519 keypairs where: +- The seed (private key) is stored in a file readable by the service +- The public key is configured in the NATS server's user list + +### Credential Types + +| Credential | Purpose | Publish Permissions | Subscribe Permissions | +|------------|---------|---------------------|----------------------| +| listener | Host listener service | `deploy.responses.>` | `deploy.*.>`, `deploy.discover` | +| mcp-deployer | MCP test-tier access | `deploy.test.>`, `deploy.discover` | `deploy.responses.>` | +| admin-deployer | Full deployment access | `deploy.test.>`, `deploy.prod.>`, `deploy.discover` | `deploy.responses.>` | + +## Flake Structure + +The flake.nix should provide: + +1. **Package**: The Go binary +2. **NixOS module**: The listener service configuration +3. **Development shell**: Go toolchain for development + +```nix +{ + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + }; + + outputs = { self, nixpkgs }: { + packages.x86_64-linux.default = /* Go package build */; + packages.x86_64-linux.homelab-deploy = self.packages.x86_64-linux.default; + + nixosModules.default = import ./nixos/module.nix; + nixosModules.homelab-deploy = self.nixosModules.default; + + devShells.x86_64-linux.default = /* Go dev shell */; + }; +} +``` + +## Implementation Notes + +### Go Dependencies + +Recommended libraries: +- `github.com/nats-io/nats.go` - NATS client +- `github.com/spf13/cobra` - CLI framework +- `github.com/mark3labs/mcp-go` - MCP server implementation (or similar) +- Standard library for JSON, logging, process execution + +### Error Handling + +- NATS connection errors: Retry with exponential backoff +- nixos-rebuild failures: Capture stdout/stderr, report in response message +- Timeout: Kill the nixos-rebuild process, report timeout error + +### Testing + +- Unit tests for message parsing and validation +- Integration tests using a local NATS server +- End-to-end tests with a NixOS VM (optional, can be done in consuming repo) + +## 
Security Considerations + +- **Privilege**: Listener runs as root to execute nixos-rebuild +- **Input validation**: Strictly validate revision format (alphanumeric, dashes, underscores, dots, slashes for branch names; hex for commit hashes) +- **Command injection**: Never interpolate user input into shell commands without validation +- **Rate limiting**: Consider adding rate limiting to prevent rapid-fire deployments +- **Audit logging**: Log all deployment requests with full context +- **Network isolation**: NATS should only be accessible from trusted networks + +## Future Enhancements + +These are not required for initial implementation: + +1. **Deployment locking** - Cluster-wide lock to prevent fleet-wide concurrent deploys +2. **Prometheus metrics** - Export deployment count, duration, success/failure rates +3. **Webhook triggers** - HTTP endpoint for CI/CD integration +4. **Scheduled deployments** - Deploy at specific times (though this overlaps with existing auto-upgrade) diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..c7a9a1c --- /dev/null +++ b/flake.nix @@ -0,0 +1,15 @@ +{ + description = "A very basic flake"; + + inputs = { + nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable"; + }; + + outputs = { self, nixpkgs }: { + + packages.x86_64-linux.hello = nixpkgs.legacyPackages.x86_64-linux.hello; + + packages.x86_64-linux.default = self.packages.x86_64-linux.hello; + + }; +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..e7f01b6 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module git.t-juice.club/torjus/homelab-deploy + +go 1.25.5