From 14f5b31fafd165a68e7a62b003681c0f228fc81c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 10 Feb 2026 22:03:14 +0100 Subject: [PATCH 1/4] feat: add builder mode for centralized Nix builds Add a new "builder" capability to trigger Nix builds on a dedicated build host via NATS messaging. This allows pre-building NixOS configurations before deployment. New components: - Builder mode: subscribes to build.<repo>.* subjects, executes nix build - Build CLI command: triggers builds with progress tracking - MCP build tool: available with --enable-builds flag - Builder metrics: tracks build success/failure per repo and host - NixOS module: services.homelab-deploy.builder The builder uses a YAML config file to define allowed repositories with their URLs and default branches. Builds can target all hosts or specific hosts, with real-time progress updates. Co-Authored-By: Claude Opus 4.5 --- README.md | 189 ++++++++++++++++- cmd/homelab-deploy/main.go | 203 ++++++++++++++++++- go.mod | 2 +- internal/builder/builder.go | 323 ++++++++++++++++++++++++++++++ internal/builder/config.go | 65 ++++++ internal/builder/executor.go | 116 +++++++++++ internal/cli/build.go | 140 +++++++++++++ internal/mcp/build_tools.go | 109 ++++++++++ internal/mcp/server.go | 6 + internal/messages/build.go | 125 ++++++++++++ internal/metrics/build_metrics.go | 99 +++++++++ internal/metrics/server.go | 5 + nixos/module.nix | 210 ++++++++++++++----- 13 files changed, 1535 insertions(+), 57 deletions(-) create mode 100644 internal/builder/builder.go create mode 100644 internal/builder/config.go create mode 100644 internal/builder/executor.go create mode 100644 internal/cli/build.go create mode 100644 internal/mcp/build_tools.go create mode 100644 internal/messages/build.go create mode 100644 internal/metrics/build_metrics.go diff --git a/README.md b/README.md index b5107c5..2eef68e 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,12 @@ A message-based deployment system for NixOS 
configurations using NATS for messag ## Overview -The `homelab-deploy` binary provides three operational modes: +The `homelab-deploy` binary provides four operational modes: 1. **Listener mode** - Runs on each NixOS host as a systemd service, subscribing to NATS subjects and executing `nixos-rebuild` when deployment requests arrive -2. **MCP mode** - Runs as an MCP (Model Context Protocol) server, exposing deployment tools for AI assistants -3. **CLI mode** - Manual deployment commands for administrators +2. **Builder mode** - Runs on a dedicated build host, subscribing to NATS subjects and executing `nix build` to pre-build configurations +3. **MCP mode** - Runs as an MCP (Model Context Protocol) server, exposing deployment tools for AI assistants +4. **CLI mode** - Manual deployment and build commands for administrators ## Installation @@ -128,6 +129,82 @@ homelab-deploy deploy prod-dns --nats-url ... --nkey-file ... Alias lookup: `HOMELAB_DEPLOY_ALIAS_<NAME>` where name is uppercased and hyphens become underscores. 
+### Builder Mode + +Run on a dedicated build host to pre-build NixOS configurations: + +```bash +homelab-deploy builder \ + --nats-url nats://nats.example.com:4222 \ + --nkey-file /run/secrets/builder.nkey \ + --config /etc/homelab-deploy/builder.yaml \ + --timeout 1800 \ + --metrics-enabled \ + --metrics-addr :9973 +``` + +#### Builder Configuration File + +The builder uses a YAML configuration file to define allowed repositories: + +```yaml +repos: + nixos-servers: + url: "git+https://git.example.com/org/nixos-servers.git" + default_branch: "master" + homelab: + url: "git+ssh://git@github.com/user/homelab.git" + default_branch: "main" +``` + +#### Builder Flags + +| Flag | Required | Description | +|------|----------|-------------| +| `--nats-url` | Yes | NATS server URL | +| `--nkey-file` | Yes | Path to NKey seed file | +| `--config` | Yes | Path to builder configuration file | +| `--timeout` | No | Build timeout per host in seconds (default: 1800) | +| `--metrics-enabled` | No | Enable Prometheus metrics endpoint | +| `--metrics-addr` | No | Metrics HTTP server address (default: `:9973`) | + +### Build Command + +Trigger a build on the build server: + +```bash +# Build all hosts in a repository +homelab-deploy build nixos-servers --all \ + --nats-url nats://nats.example.com:4222 \ + --nkey-file /run/secrets/deployer.nkey + +# Build a specific host +homelab-deploy build nixos-servers myhost \ + --nats-url nats://nats.example.com:4222 \ + --nkey-file /run/secrets/deployer.nkey + +# Build with a specific branch +homelab-deploy build nixos-servers --all --branch feature-x \ + --nats-url nats://nats.example.com:4222 \ + --nkey-file /run/secrets/deployer.nkey + +# JSON output for scripting +homelab-deploy build nixos-servers --all --json \ + --nats-url nats://nats.example.com:4222 \ + --nkey-file /run/secrets/deployer.nkey +``` + +#### Build Flags + +| Flag | Required | Env Var | Description | +|------|----------|---------|-------------| +| `--nats-url` | Yes | 
`HOMELAB_DEPLOY_NATS_URL` | NATS server URL | +| `--nkey-file` | Yes | `HOMELAB_DEPLOY_NKEY_FILE` | Path to NKey seed file | +| `--branch` | No | `HOMELAB_DEPLOY_BRANCH` | Git branch (uses repo default if not specified) | +| `--all` | No | - | Build all hosts in the repository | +| `--timeout` | No | `HOMELAB_DEPLOY_BUILD_TIMEOUT` | Response timeout in seconds (default: 3600) | +| `--json` | No | - | Output results as JSON | + ### MCP Server Mode Run as an MCP server for AI assistant integration: @@ -144,6 +221,12 @@ homelab-deploy mcp \ --nkey-file /run/secrets/mcp.nkey \ --enable-admin \ --admin-nkey-file /run/secrets/admin.nkey + +# With build tool enabled +homelab-deploy mcp \ + --nats-url nats://nats.example.com:4222 \ + --nkey-file /run/secrets/mcp.nkey \ + --enable-builds ``` #### MCP Tools @@ -153,6 +236,7 @@ homelab-deploy mcp \ | `deploy` | Deploy to test-tier hosts only | | `deploy_admin` | Deploy to any tier (requires `--enable-admin`) | | `list_hosts` | Discover available deployment targets | +| `build` | Trigger builds on the build server (requires `--enable-builds`) | #### Tool Parameters @@ -167,6 +251,12 @@ homelab-deploy mcp \ **list_hosts:** - `tier` - Filter by tier (optional) +**build:** +- `repo` - Repository name (required, must match builder config) +- `target` - Target hostname (optional, defaults to all) +- `all` - Build all hosts (default if no target specified) +- `branch` - Git branch (uses repo default if not specified) + ## NixOS Module Add the module to your NixOS configuration: @@ -224,6 +314,37 @@ Default `deploySubjects`: ] ``` +### Builder Module Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `enable` | bool | `false` | Enable the builder service | +| `package` | package | from flake | Package to use | +| `natsUrl` | string | required | NATS server URL | +| `nkeyFile` | path | required | Path to NKey seed file | +| `configFile` | path | required | Path to builder configuration 
file | +| `timeout` | int | `1800` | Build timeout per host in seconds | +| `environment` | attrs | `{}` | Additional environment variables | +| `metrics.enable` | bool | `false` | Enable Prometheus metrics endpoint | +| `metrics.address` | string | `":9973"` | Metrics HTTP server address | +| `metrics.openFirewall` | bool | `false` | Open firewall for metrics port | + +Example builder configuration: + +```nix +services.homelab-deploy.builder = { + enable = true; + natsUrl = "nats://nats.example.com:4222"; + nkeyFile = "/run/secrets/homelab-deploy-builder-nkey"; + configFile = "/etc/homelab-deploy/builder.yaml"; + metrics = { + enable = true; + address = ":9973"; + openFirewall = true; + }; +}; +``` + ## Prometheus Metrics The listener can expose Prometheus metrics for monitoring deployment operations. @@ -298,6 +419,24 @@ histogram_quantile(0.95, rate(homelab_deploy_deployment_duration_seconds_bucket[ sum(homelab_deploy_deployment_in_progress) ``` +### Builder Metrics + +When running in builder mode, additional metrics are available: + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `homelab_deploy_builds_total` | Counter | `repo`, `status` | Total builds processed | +| `homelab_deploy_build_host_total` | Counter | `repo`, `host`, `status` | Total host builds processed | +| `homelab_deploy_build_duration_seconds` | Histogram | `repo`, `host` | Build execution time per host | +| `homelab_deploy_build_last_timestamp` | Gauge | `repo` | Timestamp of last build attempt | +| `homelab_deploy_build_last_success_timestamp` | Gauge | `repo` | Timestamp of last successful build | +| `homelab_deploy_build_last_failure_timestamp` | Gauge | `repo` | Timestamp of last failed build | + +**Label values:** +- `status`: `success`, `failure` +- `repo`: Repository name from config +- `host`: Host name being built + ## Message Protocol ### Deploy Request @@ -325,6 +464,37 @@ sum(homelab_deploy_deployment_in_progress) **Error codes:** 
`invalid_revision`, `invalid_action`, `already_running`, `build_failed`, `timeout` +### Build Request + +```json +{ + "repo": "nixos-servers", + "target": "all", + "branch": "main", + "reply_to": "build.responses.abc123" +} +``` + +### Build Response + +```json +{ + "status": "completed", + "message": "built 5/5 hosts successfully", + "results": [ + {"host": "host1", "success": true, "duration_seconds": 120.5}, + {"host": "host2", "success": true, "duration_seconds": 95.3} + ], + "total_duration_seconds": 450.2, + "succeeded": 5, + "failed": 0 +} +``` + +**Status values:** `started`, `progress`, `completed`, `failed`, `rejected` + +Progress updates include `host`, `host_success`, `hosts_completed`, and `hosts_total` fields. + ## NATS Authentication All connections use NKey authentication. Generate keys with: @@ -354,13 +524,22 @@ The deployment system uses the following NATS subject hierarchy: - `deploy.prod.all` - Deploy to all production hosts - `deploy.prod.role.dns` - Deploy to all DNS servers in production +### Build Subjects + +| Subject Pattern | Purpose | +|-----------------|---------| +| `build.<repo>.*` | Build requests for a repository | +| `build.<repo>.all` | Build all hosts in a repository | +| `build.<repo>.<hostname>` | Build a specific host | + ### Response Subjects | Subject Pattern | Purpose | |-----------------|---------| | `deploy.responses.<uuid>` | Unique reply subject for each deployment request | +| `build.responses.<uuid>` | Unique reply subject for each build request | -Deployers create a unique response subject for each request and include it in the `reply_to` field. Listeners publish status updates to this subject. +Deployers and build clients create a unique response subject for each request and include it in the `reply_to` field. Listeners and builders publish status updates to this subject. 
### Discovery Subject @@ -451,7 +630,9 @@ authorization { | Credential Type | Publish | Subscribe | |-----------------|---------|-----------| | Listener | `deploy.responses.>`, `deploy.discover` | Own subjects, `deploy.discover` | +| Builder | `build.responses.>` | `build.<repo>.*` for each configured repo | | Test deployer | `deploy.test.>`, `deploy.discover` | `deploy.responses.>`, `deploy.discover` | +| Build client | `build.<repo>.*` | `build.responses.>` | | Admin deployer | `deploy.>` | `deploy.>` | ### Generating NKeys diff --git a/cmd/homelab-deploy/main.go b/cmd/homelab-deploy/main.go index 4c5bdde..758fe73 100644 --- a/cmd/homelab-deploy/main.go +++ b/cmd/homelab-deploy/main.go @@ -9,6 +9,7 @@ import ( "syscall" "time" + "git.t-juice.club/torjus/homelab-deploy/internal/builder" deploycli "git.t-juice.club/torjus/homelab-deploy/internal/cli" "git.t-juice.club/torjus/homelab-deploy/internal/listener" "git.t-juice.club/torjus/homelab-deploy/internal/mcp" @@ -16,7 +17,7 @@ import ( "github.com/urfave/cli/v3" ) -const version = "0.1.13" +const version = "0.2.0" func main() { app := &cli.Command{ @@ -25,8 +26,10 @@ func main() { Version: version, Commands: []*cli.Command{ listenerCommand(), + builderCommand(), mcpCommand(), deployCommand(), + buildCommand(), listHostsCommand(), }, } @@ -175,6 +178,10 @@ func mcpCommand() *cli.Command { Usage: "Timeout in seconds for deployment operations", Value: 900, }, + &cli.BoolFlag{ + Name: "enable-builds", + Usage: "Enable build tool", + }, }, Action: func(_ context.Context, c *cli.Command) error { enableAdmin := c.Bool("enable-admin") @@ -189,6 +196,7 @@ func mcpCommand() *cli.Command { NKeyFile: c.String("nkey-file"), EnableAdmin: enableAdmin, AdminNKeyFile: adminNKeyFile, + EnableBuilds: c.Bool("enable-builds"), DiscoverSubject: c.String("discover-subject"), Timeout: time.Duration(c.Int("timeout")) * time.Second, } @@ -374,3 +382,196 @@ func listHostsCommand() *cli.Command { }, } } + +func builderCommand() *cli.Command { + return 
&cli.Command{ + Name: "builder", + Usage: "Run as a build server (systemd service mode)", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "nats-url", + Usage: "NATS server URL", + Required: true, + }, + &cli.StringFlag{ + Name: "nkey-file", + Usage: "Path to NKey seed file for NATS authentication", + Required: true, + }, + &cli.StringFlag{ + Name: "config", + Usage: "Path to builder configuration file", + Required: true, + }, + &cli.IntFlag{ + Name: "timeout", + Usage: "Build timeout in seconds per host", + Value: 1800, + }, + &cli.BoolFlag{ + Name: "metrics-enabled", + Usage: "Enable Prometheus metrics endpoint", + }, + &cli.StringFlag{ + Name: "metrics-addr", + Usage: "Address for Prometheus metrics HTTP server", + Value: ":9973", + }, + }, + Action: func(ctx context.Context, c *cli.Command) error { + repoCfg, err := builder.LoadConfig(c.String("config")) + if err != nil { + return fmt.Errorf("failed to load config: %w", err) + } + + cfg := builder.BuilderConfig{ + NATSUrl: c.String("nats-url"), + NKeyFile: c.String("nkey-file"), + ConfigFile: c.String("config"), + Timeout: time.Duration(c.Int("timeout")) * time.Second, + MetricsEnabled: c.Bool("metrics-enabled"), + MetricsAddr: c.String("metrics-addr"), + } + + logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: slog.LevelInfo, + })) + + b := builder.New(cfg, repoCfg, logger) + + // Handle shutdown signals + ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + return b.Run(ctx) + }, + } +} + +func buildCommand() *cli.Command { + return &cli.Command{ + Name: "build", + Usage: "Trigger a build on the build server", + ArgsUsage: " [hostname]", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "nats-url", + Usage: "NATS server URL", + Sources: cli.EnvVars("HOMELAB_DEPLOY_NATS_URL"), + Required: true, + }, + &cli.StringFlag{ + Name: "nkey-file", + Usage: "Path to NKey seed file for NATS authentication", + Sources: 
cli.EnvVars("HOMELAB_DEPLOY_NKEY_FILE"), + Required: true, + }, + &cli.StringFlag{ + Name: "branch", + Usage: "Git branch to build (uses repo default if not specified)", + Sources: cli.EnvVars("HOMELAB_DEPLOY_BRANCH"), + }, + &cli.BoolFlag{ + Name: "all", + Usage: "Build all hosts in the repo", + }, + &cli.IntFlag{ + Name: "timeout", + Usage: "Timeout in seconds for collecting responses", + Sources: cli.EnvVars("HOMELAB_DEPLOY_BUILD_TIMEOUT"), + Value: 3600, + }, + &cli.BoolFlag{ + Name: "json", + Usage: "Output results as JSON", + }, + }, + Action: func(ctx context.Context, c *cli.Command) error { + if c.Args().Len() < 1 { + return fmt.Errorf("repo argument required") + } + + repo := c.Args().First() + target := c.Args().Get(1) + all := c.Bool("all") + + if target == "" && !all { + return fmt.Errorf("must specify hostname or --all") + } + if target != "" && all { + return fmt.Errorf("cannot specify both hostname and --all") + } + if all { + target = "all" + } + + cfg := deploycli.BuildConfig{ + NATSUrl: c.String("nats-url"), + NKeyFile: c.String("nkey-file"), + Repo: repo, + Target: target, + Branch: c.String("branch"), + Timeout: time.Duration(c.Int("timeout")) * time.Second, + } + + jsonOutput := c.Bool("json") + if !jsonOutput { + branchStr := cfg.Branch + if branchStr == "" { + branchStr = "(default)" + } + fmt.Printf("Building %s target=%s branch=%s\n", repo, target, branchStr) + } + + // Handle shutdown signals + ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + result, err := deploycli.Build(ctx, cfg, func(resp *messages.BuildResponse) { + if jsonOutput { + return + } + switch resp.Status { + case messages.BuildStatusStarted: + fmt.Printf("Started: %s\n", resp.Message) + case messages.BuildStatusProgress: + successStr := "..." 
+ if resp.HostSuccess != nil { + if *resp.HostSuccess { + successStr = "success" + } else { + successStr = "failed" + } + } + fmt.Printf("[%d/%d] %s: %s\n", resp.HostsCompleted, resp.HostsTotal, resp.Host, successStr) + case messages.BuildStatusCompleted, messages.BuildStatusFailed: + fmt.Printf("\n%s\n", resp.Message) + case messages.BuildStatusRejected: + fmt.Printf("Rejected: %s\n", resp.Message) + } + }) + if err != nil { + return fmt.Errorf("build failed: %w", err) + } + + if jsonOutput { + data, err := result.MarshalJSON() + if err != nil { + return fmt.Errorf("failed to marshal result: %w", err) + } + fmt.Println(string(data)) + } else if result.FinalResponse != nil { + fmt.Printf("\nBuild complete: %d succeeded, %d failed (%.1fs)\n", + result.FinalResponse.Succeeded, + result.FinalResponse.Failed, + result.FinalResponse.TotalDurationSeconds) + } + + if !result.AllSucceeded() { + return fmt.Errorf("some builds failed") + } + + return nil + }, + } +} diff --git a/go.mod b/go.mod index 835f140..59b7c57 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/nats-io/nkeys v0.4.15 github.com/prometheus/client_golang v1.23.2 github.com/urfave/cli/v3 v3.6.2 + gopkg.in/yaml.v3 v3.0.1 ) require ( @@ -32,5 +33,4 @@ require ( golang.org/x/crypto v0.47.0 // indirect golang.org/x/sys v0.40.0 // indirect google.golang.org/protobuf v1.36.8 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/internal/builder/builder.go b/internal/builder/builder.go new file mode 100644 index 0000000..0ca481d --- /dev/null +++ b/internal/builder/builder.go @@ -0,0 +1,323 @@ +package builder + +import ( + "context" + "fmt" + "log/slog" + "sort" + "sync" + "time" + + "git.t-juice.club/torjus/homelab-deploy/internal/messages" + "git.t-juice.club/torjus/homelab-deploy/internal/metrics" + "git.t-juice.club/torjus/homelab-deploy/internal/nats" +) + +// BuilderConfig holds the configuration for the builder. 
+type BuilderConfig struct { + NATSUrl string + NKeyFile string + ConfigFile string + Timeout time.Duration + MetricsEnabled bool + MetricsAddr string +} + +// Builder handles build requests from NATS. +type Builder struct { + cfg BuilderConfig + repoCfg *Config + client *nats.Client + executor *Executor + lock sync.Mutex + busy bool + logger *slog.Logger + + // metrics server and collector (nil if metrics disabled) + metricsServer *metrics.Server + metrics *metrics.BuildCollector +} + +// New creates a new builder with the given configuration. +func New(cfg BuilderConfig, repoCfg *Config, logger *slog.Logger) *Builder { + if logger == nil { + logger = slog.Default() + } + + b := &Builder{ + cfg: cfg, + repoCfg: repoCfg, + executor: NewExecutor(cfg.Timeout), + logger: logger, + } + + if cfg.MetricsEnabled { + b.metricsServer = metrics.NewServer(metrics.ServerConfig{ + Addr: cfg.MetricsAddr, + Logger: logger, + }) + b.metrics = metrics.NewBuildCollector(b.metricsServer.Registry()) + } + + return b +} + +// Run starts the builder and blocks until the context is cancelled. 
+func (b *Builder) Run(ctx context.Context) error { + // Start metrics server if enabled + if b.metricsServer != nil { + if err := b.metricsServer.Start(); err != nil { + return fmt.Errorf("failed to start metrics server: %w", err) + } + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = b.metricsServer.Shutdown(shutdownCtx) + }() + } + + // Connect to NATS + b.logger.Info("connecting to NATS", "url", b.cfg.NATSUrl) + + client, err := nats.Connect(nats.Config{ + URL: b.cfg.NATSUrl, + NKeyFile: b.cfg.NKeyFile, + Name: "homelab-deploy-builder", + }) + if err != nil { + return fmt.Errorf("failed to connect to NATS: %w", err) + } + b.client = client + defer b.client.Close() + + b.logger.Info("connected to NATS") + + // Subscribe to build subjects for each repo + for repoName := range b.repoCfg.Repos { + // Subscribe to build..all and build.. + allSubject := fmt.Sprintf("build.%s.*", repoName) + b.logger.Info("subscribing to build subject", "subject", allSubject) + if _, err := b.client.Subscribe(allSubject, b.handleBuildRequest); err != nil { + return fmt.Errorf("failed to subscribe to %s: %w", allSubject, err) + } + } + + b.logger.Info("builder started", "repos", len(b.repoCfg.Repos)) + + // Wait for context cancellation + <-ctx.Done() + b.logger.Info("shutting down builder") + + return nil +} + +func (b *Builder) handleBuildRequest(subject string, data []byte) { + req, err := messages.UnmarshalBuildRequest(data) + if err != nil { + b.logger.Error("failed to unmarshal build request", + "subject", subject, + "error", err, + ) + return + } + + b.logger.Info("received build request", + "subject", subject, + "repo", req.Repo, + "target", req.Target, + "branch", req.Branch, + "reply_to", req.ReplyTo, + ) + + // Validate request + if err := req.Validate(); err != nil { + b.logger.Warn("invalid build request", "error", err) + b.sendResponse(req.ReplyTo, messages.NewBuildResponse( + 
messages.BuildStatusRejected, + err.Error(), + )) + return + } + + // Get repo config + repo, err := b.repoCfg.GetRepo(req.Repo) + if err != nil { + b.logger.Warn("unknown repo", "repo", req.Repo) + b.sendResponse(req.ReplyTo, messages.NewBuildResponse( + messages.BuildStatusRejected, + fmt.Sprintf("unknown repo: %s", req.Repo), + )) + return + } + + // Try to acquire lock + b.lock.Lock() + if b.busy { + b.lock.Unlock() + b.logger.Warn("build already in progress") + b.sendResponse(req.ReplyTo, messages.NewBuildResponse( + messages.BuildStatusRejected, + "another build is already in progress", + )) + return + } + b.busy = true + b.lock.Unlock() + + defer func() { + b.lock.Lock() + b.busy = false + b.lock.Unlock() + }() + + // Use default branch if not specified + branch := req.Branch + if branch == "" { + branch = repo.DefaultBranch + } + + // Determine hosts to build + var hosts []string + if req.Target == "all" { + // List hosts from flake + b.sendResponse(req.ReplyTo, messages.NewBuildResponse( + messages.BuildStatusStarted, + "discovering hosts...", + )) + + hosts, err = b.executor.ListHosts(context.Background(), repo.URL, branch) + if err != nil { + b.logger.Error("failed to list hosts", "error", err) + b.sendResponse(req.ReplyTo, messages.NewBuildResponse( + messages.BuildStatusFailed, + fmt.Sprintf("failed to list hosts: %v", err), + ).WithError(err.Error())) + if b.metrics != nil { + b.metrics.RecordBuildFailure(req.Repo, "") + } + return + } + // Sort hosts for consistent ordering + sort.Strings(hosts) + } else { + hosts = []string{req.Target} + } + + if len(hosts) == 0 { + b.sendResponse(req.ReplyTo, messages.NewBuildResponse( + messages.BuildStatusFailed, + "no hosts to build", + )) + return + } + + // Send started response + b.sendResponse(req.ReplyTo, &messages.BuildResponse{ + Status: messages.BuildStatusStarted, + Message: fmt.Sprintf("building %d host(s)", len(hosts)), + HostsTotal: len(hosts), + }) + + // Build each host sequentially + startTime := 
time.Now() + results := make([]messages.BuildHostResult, 0, len(hosts)) + succeeded := 0 + failed := 0 + + for i, host := range hosts { + hostStart := time.Now() + b.logger.Info("building host", + "host", host, + "progress", fmt.Sprintf("%d/%d", i+1, len(hosts)), + "command", b.executor.BuildCommand(repo.URL, branch, host), + ) + + result := b.executor.Build(context.Background(), repo.URL, branch, host) + hostDuration := time.Since(hostStart).Seconds() + + hostResult := messages.BuildHostResult{ + Host: host, + Success: result.Success, + DurationSeconds: hostDuration, + } + if !result.Success { + hostResult.Error = result.Stderr + if hostResult.Error == "" && result.Error != nil { + hostResult.Error = result.Error.Error() + } + } + results = append(results, hostResult) + + if result.Success { + succeeded++ + b.logger.Info("host build succeeded", "host", host, "duration", hostDuration) + if b.metrics != nil { + b.metrics.RecordHostBuildSuccess(req.Repo, host, hostDuration) + } + } else { + failed++ + b.logger.Error("host build failed", "host", host, "error", hostResult.Error) + if b.metrics != nil { + b.metrics.RecordHostBuildFailure(req.Repo, host, hostDuration) + } + } + + // Send progress update + success := result.Success + b.sendResponse(req.ReplyTo, &messages.BuildResponse{ + Status: messages.BuildStatusProgress, + Host: host, + HostSuccess: &success, + HostsCompleted: i + 1, + HostsTotal: len(hosts), + }) + } + + totalDuration := time.Since(startTime).Seconds() + + // Send final response + status := messages.BuildStatusCompleted + message := fmt.Sprintf("built %d/%d hosts successfully", succeeded, len(hosts)) + if failed > 0 { + status = messages.BuildStatusFailed + message = fmt.Sprintf("build failed: %d/%d hosts failed", failed, len(hosts)) + } + + b.sendResponse(req.ReplyTo, &messages.BuildResponse{ + Status: status, + Message: message, + Results: results, + TotalDurationSeconds: totalDuration, + Succeeded: succeeded, + Failed: failed, + }) + + // Record 
overall build metrics + if b.metrics != nil { + if failed == 0 { + b.metrics.RecordBuildSuccess(req.Repo) + } else { + b.metrics.RecordBuildFailure(req.Repo, "") + } + } +} + +func (b *Builder) sendResponse(replyTo string, resp *messages.BuildResponse) { + data, err := resp.Marshal() + if err != nil { + b.logger.Error("failed to marshal build response", "error", err) + return + } + + if err := b.client.Publish(replyTo, data); err != nil { + b.logger.Error("failed to publish build response", + "reply_to", replyTo, + "error", err, + ) + } + + // Flush to ensure response is sent immediately + if err := b.client.Flush(); err != nil { + b.logger.Error("failed to flush", "error", err) + } +} diff --git a/internal/builder/config.go b/internal/builder/config.go new file mode 100644 index 0000000..bd3d87a --- /dev/null +++ b/internal/builder/config.go @@ -0,0 +1,65 @@ +package builder + +import ( + "fmt" + "os" + + "gopkg.in/yaml.v3" +) + +// RepoConfig holds configuration for a single repository. +type RepoConfig struct { + URL string `yaml:"url"` + DefaultBranch string `yaml:"default_branch"` +} + +// Config holds the builder configuration. +type Config struct { + Repos map[string]RepoConfig `yaml:"repos"` +} + +// LoadConfig loads configuration from a YAML file. +func LoadConfig(path string) (*Config, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("failed to parse config file: %w", err) + } + + if err := cfg.Validate(); err != nil { + return nil, err + } + + return &cfg, nil +} + +// Validate checks that the configuration is valid. 
+func (c *Config) Validate() error { + if len(c.Repos) == 0 { + return fmt.Errorf("no repos configured") + } + + for name, repo := range c.Repos { + if repo.URL == "" { + return fmt.Errorf("repo %q: url is required", name) + } + if repo.DefaultBranch == "" { + return fmt.Errorf("repo %q: default_branch is required", name) + } + } + + return nil +} + +// GetRepo returns the configuration for a repository, or an error if not found. +func (c *Config) GetRepo(name string) (*RepoConfig, error) { + repo, ok := c.Repos[name] + if !ok { + return nil, fmt.Errorf("repo %q not found in configuration", name) + } + return &repo, nil +} diff --git a/internal/builder/executor.go b/internal/builder/executor.go new file mode 100644 index 0000000..92fe870 --- /dev/null +++ b/internal/builder/executor.go @@ -0,0 +1,116 @@ +package builder + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os/exec" + "time" +) + +// Executor handles the execution of nix build commands. +type Executor struct { + timeout time.Duration +} + +// NewExecutor creates a new build executor. +func NewExecutor(timeout time.Duration) *Executor { + return &Executor{ + timeout: timeout, + } +} + +// BuildResult contains the result of a build execution. +type BuildResult struct { + Success bool + ExitCode int + Stdout string + Stderr string + Error error +} + +// FlakeShowResult contains the parsed output of nix flake show. +type FlakeShowResult struct { + NixosConfigurations map[string]any `json:"nixosConfigurations"` +} + +// ListHosts returns the list of hosts (nixosConfigurations) available in a flake. 
+func (e *Executor) ListHosts(ctx context.Context, flakeURL, branch string) ([]string, error) { + ctx, cancel := context.WithTimeout(ctx, 60*time.Second) + defer cancel() + + flakeRef := fmt.Sprintf("%s?ref=%s", flakeURL, branch) + cmd := exec.CommandContext(ctx, "nix", "flake", "show", "--json", flakeRef) + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + if ctx.Err() == context.DeadlineExceeded { + return nil, fmt.Errorf("timeout listing hosts") + } + return nil, fmt.Errorf("failed to list hosts: %w\n%s", err, stderr.String()) + } + + var result FlakeShowResult + if err := json.Unmarshal(stdout.Bytes(), &result); err != nil { + return nil, fmt.Errorf("failed to parse flake show output: %w", err) + } + + hosts := make([]string, 0, len(result.NixosConfigurations)) + for host := range result.NixosConfigurations { + hosts = append(hosts, host) + } + + return hosts, nil +} + +// Build builds a single host's system configuration. 
+func (e *Executor) Build(ctx context.Context, flakeURL, branch, host string) *BuildResult { + ctx, cancel := context.WithTimeout(ctx, e.timeout) + defer cancel() + + // Build the flake reference for the system toplevel + flakeRef := fmt.Sprintf("%s?ref=%s#nixosConfigurations.%s.config.system.build.toplevel", flakeURL, branch, host) + + cmd := exec.CommandContext(ctx, "nix", "build", "--no-link", flakeRef) + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + + result := &BuildResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + } + + if err != nil { + result.Success = false + result.Error = err + + if ctx.Err() == context.DeadlineExceeded { + result.Error = fmt.Errorf("build timed out after %v", e.timeout) + } + + if exitErr, ok := err.(*exec.ExitError); ok { + result.ExitCode = exitErr.ExitCode() + } else { + result.ExitCode = -1 + } + } else { + result.Success = true + result.ExitCode = 0 + } + + return result +} + +// BuildCommand returns the command that would be executed (for logging/debugging). +func (e *Executor) BuildCommand(flakeURL, branch, host string) string { + flakeRef := fmt.Sprintf("%s?ref=%s#nixosConfigurations.%s.config.system.build.toplevel", flakeURL, branch, host) + return fmt.Sprintf("nix build --no-link %s", flakeRef) +} diff --git a/internal/cli/build.go b/internal/cli/build.go new file mode 100644 index 0000000..c7dbb64 --- /dev/null +++ b/internal/cli/build.go @@ -0,0 +1,140 @@ +package cli + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "time" + + "github.com/google/uuid" + + "git.t-juice.club/torjus/homelab-deploy/internal/messages" + "git.t-juice.club/torjus/homelab-deploy/internal/nats" +) + +// BuildConfig holds configuration for a build operation. +type BuildConfig struct { + NATSUrl string + NKeyFile string + Repo string + Target string + Branch string + Timeout time.Duration +} + +// BuildResult contains the aggregated results from a build. 
+type BuildResult struct { + Responses []*messages.BuildResponse + FinalResponse *messages.BuildResponse + Errors []error +} + +// AllSucceeded returns true if the build completed successfully. +func (r *BuildResult) AllSucceeded() bool { + if len(r.Errors) > 0 { + return false + } + if r.FinalResponse == nil { + return false + } + return r.FinalResponse.Status == messages.BuildStatusCompleted && r.FinalResponse.Failed == 0 +} + +// MarshalJSON returns the JSON representation of the build result. +func (r *BuildResult) MarshalJSON() ([]byte, error) { + if r.FinalResponse != nil { + return json.Marshal(r.FinalResponse) + } + return json.Marshal(map[string]any{ + "status": "unknown", + "responses": r.Responses, + "errors": r.Errors, + }) +} + +// Build triggers a build and collects responses. +func Build(ctx context.Context, cfg BuildConfig, onResponse func(*messages.BuildResponse)) (*BuildResult, error) { + // Connect to NATS + client, err := nats.Connect(nats.Config{ + URL: cfg.NATSUrl, + NKeyFile: cfg.NKeyFile, + Name: "homelab-deploy-build-cli", + }) + if err != nil { + return nil, fmt.Errorf("failed to connect to NATS: %w", err) + } + defer client.Close() + + // Generate unique reply subject + requestID := uuid.New().String() + replySubject := fmt.Sprintf("build.responses.%s", requestID) + + var mu sync.Mutex + result := &BuildResult{} + done := make(chan struct{}) + + // Subscribe to reply subject + sub, err := client.Subscribe(replySubject, func(subject string, data []byte) { + resp, err := messages.UnmarshalBuildResponse(data) + if err != nil { + mu.Lock() + result.Errors = append(result.Errors, fmt.Errorf("failed to unmarshal response: %w", err)) + mu.Unlock() + return + } + + mu.Lock() + result.Responses = append(result.Responses, resp) + if resp.Status.IsFinal() { + result.FinalResponse = resp + select { + case <-done: + default: + close(done) + } + } + mu.Unlock() + + if onResponse != nil { + onResponse(resp) + } + }) + if err != nil { + return nil, 
fmt.Errorf("failed to subscribe to reply subject: %w", err) + } + defer func() { _ = sub.Unsubscribe() }() + + // Build and send request + req := &messages.BuildRequest{ + Repo: cfg.Repo, + Target: cfg.Target, + Branch: cfg.Branch, + ReplyTo: replySubject, + } + + data, err := req.Marshal() + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + // Publish to build.<repo>.<target> + buildSubject := fmt.Sprintf("build.%s.%s", cfg.Repo, cfg.Target) + if err := client.Publish(buildSubject, data); err != nil { + return nil, fmt.Errorf("failed to publish request: %w", err) + } + + if err := client.Flush(); err != nil { + return nil, fmt.Errorf("failed to flush: %w", err) + } + + // Wait for final response or timeout + select { + case <-ctx.Done(): + return result, ctx.Err() + case <-done: + return result, nil + case <-time.After(cfg.Timeout): + return result, nil + } +} diff --git a/internal/mcp/build_tools.go b/internal/mcp/build_tools.go new file mode 100644 index 0000000..161035c --- /dev/null +++ b/internal/mcp/build_tools.go @@ -0,0 +1,109 @@ +package mcp + +import ( + "context" + "fmt" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + + deploycli "git.t-juice.club/torjus/homelab-deploy/internal/cli" + "git.t-juice.club/torjus/homelab-deploy/internal/messages" +) + +// BuildTool creates the build tool definition. +func BuildTool() mcp.Tool { + return mcp.NewTool( + "build", + mcp.WithDescription("Trigger a Nix build on the build server"), + mcp.WithString("repo", + mcp.Required(), + mcp.Description("Repository name (must match builder config)"), + ), + mcp.WithString("target", + mcp.Description("Target hostname, or omit to build all hosts"), + ), + mcp.WithBoolean("all", + mcp.Description("Build all hosts in the repository (default if no target specified)"), + ), + mcp.WithString("branch", + mcp.Description("Git branch to build (uses repo default if not specified)"), + ), + ) +} + +// HandleBuild handles the build tool.
+func (h *ToolHandler) HandleBuild(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { + repo, err := request.RequireString("repo") + if err != nil { + return mcp.NewToolResultError("repo is required"), nil + } + + target := request.GetString("target", "") + all := request.GetBool("all", false) + branch := request.GetString("branch", "") + + // Default to "all" if no target specified + if target == "" { + if !all { + all = true + } + target = "all" + } + if all && target != "all" { + return mcp.NewToolResultError("cannot specify both target and all"), nil + } + + cfg := deploycli.BuildConfig{ + NATSUrl: h.cfg.NATSUrl, + NKeyFile: h.cfg.NKeyFile, + Repo: repo, + Target: target, + Branch: branch, + Timeout: h.cfg.Timeout, + } + + var output strings.Builder + branchStr := branch + if branchStr == "" { + branchStr = "(default)" + } + output.WriteString(fmt.Sprintf("Building %s target=%s branch=%s\n\n", repo, target, branchStr)) + + result, err := deploycli.Build(ctx, cfg, func(resp *messages.BuildResponse) { + switch resp.Status { + case messages.BuildStatusStarted: + output.WriteString(fmt.Sprintf("Started: %s\n", resp.Message)) + case messages.BuildStatusProgress: + successStr := "..." 
+ if resp.HostSuccess != nil { + if *resp.HostSuccess { + successStr = "success" + } else { + successStr = "failed" + } + } + output.WriteString(fmt.Sprintf("[%d/%d] %s: %s\n", resp.HostsCompleted, resp.HostsTotal, resp.Host, successStr)) + case messages.BuildStatusCompleted, messages.BuildStatusFailed: + output.WriteString(fmt.Sprintf("\n%s\n", resp.Message)) + case messages.BuildStatusRejected: + output.WriteString(fmt.Sprintf("Rejected: %s\n", resp.Message)) + } + }) + if err != nil { + return mcp.NewToolResultError(fmt.Sprintf("build failed: %v", err)), nil + } + + if result.FinalResponse != nil { + output.WriteString(fmt.Sprintf("\nBuild complete: %d succeeded, %d failed (%.1fs)\n", + result.FinalResponse.Succeeded, + result.FinalResponse.Failed, + result.FinalResponse.TotalDurationSeconds)) + } + + if !result.AllSucceeded() { + output.WriteString("WARNING: Some builds failed\n") + } + + return mcp.NewToolResultText(output.String()), nil +} diff --git a/internal/mcp/server.go b/internal/mcp/server.go index dd89c4a..e87e451 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -12,6 +12,7 @@ type ServerConfig struct { NKeyFile string EnableAdmin bool AdminNKeyFile string + EnableBuilds bool DiscoverSubject string Timeout time.Duration } @@ -49,6 +50,11 @@ func New(cfg ServerConfig) *Server { s.AddTool(DeployAdminTool(), handler.HandleDeployAdmin) } + // Optionally register build tool + if cfg.EnableBuilds { + s.AddTool(BuildTool(), handler.HandleBuild) + } + return &Server{ cfg: cfg, server: s, diff --git a/internal/messages/build.go b/internal/messages/build.go new file mode 100644 index 0000000..2dbf43f --- /dev/null +++ b/internal/messages/build.go @@ -0,0 +1,125 @@ +package messages + +import ( + "encoding/json" + "fmt" +) + +// BuildStatus represents the status of a build response. 
+type BuildStatus string + +const ( + BuildStatusStarted BuildStatus = "started" + BuildStatusProgress BuildStatus = "progress" + BuildStatusCompleted BuildStatus = "completed" + BuildStatusFailed BuildStatus = "failed" + BuildStatusRejected BuildStatus = "rejected" +) + +// IsFinal returns true if this status indicates a terminal state. +func (s BuildStatus) IsFinal() bool { + switch s { + case BuildStatusCompleted, BuildStatusFailed, BuildStatusRejected: + return true + default: + return false + } +} + +// BuildRequest is the message sent to request a build. +type BuildRequest struct { + Repo string `json:"repo"` // Must match config + Target string `json:"target"` // Hostname or "all" + Branch string `json:"branch,omitempty"` // Optional, uses repo default + ReplyTo string `json:"reply_to"` +} + +// Validate checks that the request is valid. +func (r *BuildRequest) Validate() error { + if r.Repo == "" { + return fmt.Errorf("repo is required") + } + if !revisionRegex.MatchString(r.Repo) { + return fmt.Errorf("invalid repo name format: %q", r.Repo) + } + if r.Target == "" { + return fmt.Errorf("target is required") + } + if r.Branch != "" && !revisionRegex.MatchString(r.Branch) { + return fmt.Errorf("invalid branch format: %q", r.Branch) + } + if r.ReplyTo == "" { + return fmt.Errorf("reply_to is required") + } + return nil +} + +// Marshal serializes the request to JSON. +func (r *BuildRequest) Marshal() ([]byte, error) { + return json.Marshal(r) +} + +// UnmarshalBuildRequest deserializes a request from JSON. +func UnmarshalBuildRequest(data []byte) (*BuildRequest, error) { + var r BuildRequest + if err := json.Unmarshal(data, &r); err != nil { + return nil, fmt.Errorf("failed to unmarshal build request: %w", err) + } + return &r, nil +} + +// BuildHostResult contains the result of building a single host. 
+type BuildHostResult struct { + Host string `json:"host"` + Success bool `json:"success"` + Error string `json:"error,omitempty"` + DurationSeconds float64 `json:"duration_seconds"` +} + +// BuildResponse is the message sent in response to a build request. +type BuildResponse struct { + Status BuildStatus `json:"status"` + Message string `json:"message,omitempty"` + + // Progress updates + Host string `json:"host,omitempty"` + HostSuccess *bool `json:"host_success,omitempty"` + HostsCompleted int `json:"hosts_completed,omitempty"` + HostsTotal int `json:"hosts_total,omitempty"` + + // Final response + Results []BuildHostResult `json:"results,omitempty"` + TotalDurationSeconds float64 `json:"total_duration_seconds,omitempty"` + Succeeded int `json:"succeeded,omitempty"` + Failed int `json:"failed,omitempty"` + + Error string `json:"error,omitempty"` +} + +// NewBuildResponse creates a new response with the given status and message. +func NewBuildResponse(status BuildStatus, message string) *BuildResponse { + return &BuildResponse{ + Status: status, + Message: message, + } +} + +// WithError adds an error message to the response. +func (r *BuildResponse) WithError(err string) *BuildResponse { + r.Error = err + return r +} + +// Marshal serializes the response to JSON. +func (r *BuildResponse) Marshal() ([]byte, error) { + return json.Marshal(r) +} + +// UnmarshalBuildResponse deserializes a response from JSON. 
+func UnmarshalBuildResponse(data []byte) (*BuildResponse, error) { + var r BuildResponse + if err := json.Unmarshal(data, &r); err != nil { + return nil, fmt.Errorf("failed to unmarshal build response: %w", err) + } + return &r, nil +} diff --git a/internal/metrics/build_metrics.go b/internal/metrics/build_metrics.go new file mode 100644 index 0000000..c194dc4 --- /dev/null +++ b/internal/metrics/build_metrics.go @@ -0,0 +1,99 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// BuildCollector holds all Prometheus metrics for the builder. +type BuildCollector struct { + buildsTotal *prometheus.CounterVec + buildHostTotal *prometheus.CounterVec + buildDuration *prometheus.HistogramVec + buildLastTimestamp *prometheus.GaugeVec + buildLastSuccessTime *prometheus.GaugeVec + buildLastFailureTime *prometheus.GaugeVec +} + +// NewBuildCollector creates a new build metrics collector and registers it with the given registerer. +func NewBuildCollector(reg prometheus.Registerer) *BuildCollector { + c := &BuildCollector{ + buildsTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "homelab_deploy_builds_total", + Help: "Total builds processed", + }, + []string{"repo", "status"}, + ), + buildHostTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "homelab_deploy_build_host_total", + Help: "Total host builds processed", + }, + []string{"repo", "host", "status"}, + ), + buildDuration: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "homelab_deploy_build_duration_seconds", + Help: "Build execution time per host", + Buckets: []float64{30, 60, 120, 300, 600, 900, 1200, 1800, 3600}, + }, + []string{"repo", "host"}, + ), + buildLastTimestamp: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "homelab_deploy_build_last_timestamp", + Help: "Timestamp of last build attempt", + }, + []string{"repo"}, + ), + buildLastSuccessTime: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: 
"homelab_deploy_build_last_success_timestamp", + Help: "Timestamp of last successful build", + }, + []string{"repo"}, + ), + buildLastFailureTime: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "homelab_deploy_build_last_failure_timestamp", + Help: "Timestamp of last failed build", + }, + []string{"repo"}, + ), + } + + reg.MustRegister(c.buildsTotal) + reg.MustRegister(c.buildHostTotal) + reg.MustRegister(c.buildDuration) + reg.MustRegister(c.buildLastTimestamp) + reg.MustRegister(c.buildLastSuccessTime) + reg.MustRegister(c.buildLastFailureTime) + + return c +} + +// RecordBuildSuccess records a successful build. +func (c *BuildCollector) RecordBuildSuccess(repo string) { + c.buildsTotal.WithLabelValues(repo, "success").Inc() + c.buildLastTimestamp.WithLabelValues(repo).SetToCurrentTime() + c.buildLastSuccessTime.WithLabelValues(repo).SetToCurrentTime() +} + +// RecordBuildFailure records a failed build. +func (c *BuildCollector) RecordBuildFailure(repo, errorCode string) { + c.buildsTotal.WithLabelValues(repo, "failure").Inc() + c.buildLastTimestamp.WithLabelValues(repo).SetToCurrentTime() + c.buildLastFailureTime.WithLabelValues(repo).SetToCurrentTime() +} + +// RecordHostBuildSuccess records a successful host build. +func (c *BuildCollector) RecordHostBuildSuccess(repo, host string, durationSeconds float64) { + c.buildHostTotal.WithLabelValues(repo, host, "success").Inc() + c.buildDuration.WithLabelValues(repo, host).Observe(durationSeconds) +} + +// RecordHostBuildFailure records a failed host build. 
+func (c *BuildCollector) RecordHostBuildFailure(repo, host string, durationSeconds float64) { + c.buildHostTotal.WithLabelValues(repo, host, "failure").Inc() + c.buildDuration.WithLabelValues(repo, host).Observe(durationSeconds) +} diff --git a/internal/metrics/server.go b/internal/metrics/server.go index 886d463..83ea11b 100644 --- a/internal/metrics/server.go +++ b/internal/metrics/server.go @@ -74,6 +74,11 @@ func (s *Server) Collector() *Collector { return s.collector } +// Registry returns the Prometheus registry. +func (s *Server) Registry() *prometheus.Registry { + return s.registry +} + // ScrapeCh returns a channel that receives a signal each time the metrics endpoint is scraped. func (s *Server) ScrapeCh() <-chan struct{} { return s.scrapeCh diff --git a/nixos/module.nix b/nixos/module.nix index ee0a77b..298015e 100644 --- a/nixos/module.nix +++ b/nixos/module.nix @@ -2,32 +2,47 @@ { config, lib, pkgs, ... }: let - cfg = config.services.homelab-deploy.listener; + listenerCfg = config.services.homelab-deploy.listener; + builderCfg = config.services.homelab-deploy.builder; - # Build command line arguments from configuration - args = lib.concatStringsSep " " ([ - "--hostname ${lib.escapeShellArg cfg.hostname}" - "--tier ${cfg.tier}" - "--nats-url ${lib.escapeShellArg cfg.natsUrl}" - "--nkey-file ${lib.escapeShellArg cfg.nkeyFile}" - "--flake-url ${lib.escapeShellArg cfg.flakeUrl}" - "--timeout ${toString cfg.timeout}" - "--discover-subject ${lib.escapeShellArg cfg.discoverSubject}" + # Build command line arguments for listener from configuration + listenerArgs = lib.concatStringsSep " " ([ + "--hostname ${lib.escapeShellArg listenerCfg.hostname}" + "--tier ${listenerCfg.tier}" + "--nats-url ${lib.escapeShellArg listenerCfg.natsUrl}" + "--nkey-file ${lib.escapeShellArg listenerCfg.nkeyFile}" + "--flake-url ${lib.escapeShellArg listenerCfg.flakeUrl}" + "--timeout ${toString listenerCfg.timeout}" + "--discover-subject ${lib.escapeShellArg 
listenerCfg.discoverSubject}" ] - ++ lib.optional (cfg.role != null) "--role ${lib.escapeShellArg cfg.role}" - ++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects - ++ lib.optionals cfg.metrics.enable [ + ++ lib.optional (listenerCfg.role != null) "--role ${lib.escapeShellArg listenerCfg.role}" + ++ map (s: "--deploy-subject ${lib.escapeShellArg s}") listenerCfg.deploySubjects + ++ lib.optionals listenerCfg.metrics.enable [ "--metrics-enabled" - "--metrics-addr ${lib.escapeShellArg cfg.metrics.address}" + "--metrics-addr ${lib.escapeShellArg listenerCfg.metrics.address}" + ]); + + # Build command line arguments for builder from configuration + builderArgs = lib.concatStringsSep " " ([ + "--nats-url ${lib.escapeShellArg builderCfg.natsUrl}" + "--nkey-file ${lib.escapeShellArg builderCfg.nkeyFile}" + "--config ${lib.escapeShellArg builderCfg.configFile}" + "--timeout ${toString builderCfg.timeout}" + ] + ++ lib.optionals builderCfg.metrics.enable [ + "--metrics-enabled" + "--metrics-addr ${lib.escapeShellArg builderCfg.metrics.address}" ]); # Extract port from metrics address for firewall rule - metricsPort = let - addr = cfg.metrics.address; + extractPort = addr: let # Handle both ":9972" and "0.0.0.0:9972" formats parts = lib.splitString ":" addr; in lib.toInt (lib.last parts); + listenerMetricsPort = extractPort listenerCfg.metrics.address; + builderMetricsPort = extractPort builderCfg.metrics.address; + in { options.services.homelab-deploy.listener = { @@ -124,43 +139,136 @@ in }; }; - config = lib.mkIf cfg.enable { - systemd.services.homelab-deploy-listener = { - description = "homelab-deploy listener"; - wantedBy = [ "multi-user.target" ]; - after = [ "network-online.target" ]; - wants = [ "network-online.target" ]; + options.services.homelab-deploy.builder = { + enable = lib.mkEnableOption "homelab-deploy builder service"; - # Prevent self-interruption during nixos-rebuild switch - # The service will continue running the old version until 
manually restarted - stopIfChanged = false; - restartIfChanged = false; - - environment = cfg.environment // { - # Nix needs a writable cache for git flake fetching - XDG_CACHE_HOME = "/var/cache/homelab-deploy"; - }; - - path = [ pkgs.git config.system.build.nixos-rebuild ]; - - serviceConfig = { - CacheDirectory = "homelab-deploy"; - Type = "simple"; - ExecStart = "${cfg.package}/bin/homelab-deploy listener ${args}"; - Restart = "always"; - RestartSec = 10; - - # Minimal hardening - nixos-rebuild requires broad system access: - # - Write access to /nix/store for building - # - Kernel namespace support for nix sandbox builds - # - Ability to activate system configurations - # - Network access for fetching from git/cache - # Following the approach of nixos auto-upgrade which has no hardening - }; + package = lib.mkOption { + type = lib.types.package; + default = self.packages.${pkgs.system}.homelab-deploy; + description = "The homelab-deploy package to use"; }; - networking.firewall.allowedTCPPorts = lib.mkIf (cfg.metrics.enable && cfg.metrics.openFirewall) [ - metricsPort - ]; + natsUrl = lib.mkOption { + type = lib.types.str; + description = "NATS server URL"; + example = "nats://nats.example.com:4222"; + }; + + nkeyFile = lib.mkOption { + type = lib.types.path; + description = "Path to NKey seed file for NATS authentication"; + example = "/run/secrets/homelab-deploy-builder-nkey"; + }; + + configFile = lib.mkOption { + type = lib.types.path; + description = "Path to builder configuration file (YAML)"; + example = "/etc/homelab-deploy/builder.yaml"; + }; + + timeout = lib.mkOption { + type = lib.types.int; + default = 1800; + description = "Build timeout in seconds per host"; + }; + + environment = lib.mkOption { + type = lib.types.attrsOf lib.types.str; + default = { }; + description = "Additional environment variables for the service"; + example = { GIT_SSH_COMMAND = "ssh -i /run/secrets/deploy-key"; }; + }; + + metrics = { + enable = lib.mkEnableOption 
"Prometheus metrics endpoint"; + + address = lib.mkOption { + type = lib.types.str; + default = ":9973"; + description = "Address for Prometheus metrics HTTP server"; + example = "127.0.0.1:9973"; + }; + + openFirewall = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Open firewall for metrics port"; + }; + }; }; + + config = lib.mkMerge [ + (lib.mkIf listenerCfg.enable { + systemd.services.homelab-deploy-listener = { + description = "homelab-deploy listener"; + wantedBy = [ "multi-user.target" ]; + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; + + # Prevent self-interruption during nixos-rebuild switch + # The service will continue running the old version until manually restarted + stopIfChanged = false; + restartIfChanged = false; + + environment = listenerCfg.environment // { + # Nix needs a writable cache for git flake fetching + XDG_CACHE_HOME = "/var/cache/homelab-deploy"; + }; + + path = [ pkgs.git config.system.build.nixos-rebuild ]; + + serviceConfig = { + CacheDirectory = "homelab-deploy"; + Type = "simple"; + ExecStart = "${listenerCfg.package}/bin/homelab-deploy listener ${listenerArgs}"; + Restart = "always"; + RestartSec = 10; + + # Minimal hardening - nixos-rebuild requires broad system access: + # - Write access to /nix/store for building + # - Kernel namespace support for nix sandbox builds + # - Ability to activate system configurations + # - Network access for fetching from git/cache + # Following the approach of nixos auto-upgrade which has no hardening + }; + }; + + networking.firewall.allowedTCPPorts = lib.mkIf (listenerCfg.metrics.enable && listenerCfg.metrics.openFirewall) [ + listenerMetricsPort + ]; + }) + + (lib.mkIf builderCfg.enable { + systemd.services.homelab-deploy-builder = { + description = "homelab-deploy builder"; + wantedBy = [ "multi-user.target" ]; + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; + + environment = builderCfg.environment // { 
+ # Nix needs a writable cache for git flake fetching + XDG_CACHE_HOME = "/var/cache/homelab-deploy-builder"; + }; + + path = [ pkgs.git pkgs.nix ]; + + serviceConfig = { + CacheDirectory = "homelab-deploy-builder"; + Type = "simple"; + ExecStart = "${builderCfg.package}/bin/homelab-deploy builder ${builderArgs}"; + Restart = "always"; + RestartSec = 10; + + # Minimal hardening - nix build requires broad system access: + # - Write access to /nix/store for building + # - Kernel namespace support for nix sandbox builds + # - Network access for fetching from git/cache + }; + }; + + networking.firewall.allowedTCPPorts = lib.mkIf (builderCfg.metrics.enable && builderCfg.metrics.openFirewall) [ + builderMetricsPort + ]; + }) + ]; } -- 2.49.1 From 08f1fcc6ac6a2f96cd2398fa9d5e33b2e7ebc6ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 10 Feb 2026 22:07:26 +0100 Subject: [PATCH 2/4] fix: validate target and hostname inputs to prevent injection Add input validation to address security concerns: - Validate Target field in BuildRequest against safe character pattern (must be "all" or match alphanumeric/dash/underscore/dot pattern) - Filter hostnames discovered from nix flake show output, skipping any with invalid characters before using them in build commands This prevents potential command injection via crafted NATS messages or malicious flake configurations. Co-Authored-By: Claude Opus 4.5 --- internal/builder/builder.go | 15 +++++++++++++++ internal/messages/build.go | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/internal/builder/builder.go b/internal/builder/builder.go index 0ca481d..9894c3e 100644 --- a/internal/builder/builder.go +++ b/internal/builder/builder.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log/slog" + "regexp" "sort" "sync" "time" @@ -13,6 +14,10 @@ import ( "git.t-juice.club/torjus/homelab-deploy/internal/nats" ) +// hostnameRegex validates hostnames from flake output. 
+// Allows: alphanumeric, dashes, underscores, dots. +var hostnameRegex = regexp.MustCompile(`^[a-zA-Z0-9._-]+$`) + // BuilderConfig holds the configuration for the builder. type BuilderConfig struct { NATSUrl string @@ -197,6 +202,16 @@ func (b *Builder) handleBuildRequest(subject string, data []byte) { } return } + // Filter out hostnames with invalid characters (security: prevent injection) + validHosts := make([]string, 0, len(hosts)) + for _, host := range hosts { + if hostnameRegex.MatchString(host) { + validHosts = append(validHosts, host) + } else { + b.logger.Warn("skipping hostname with invalid characters", "hostname", host) + } + } + hosts = validHosts // Sort hosts for consistent ordering sort.Strings(hosts) } else { diff --git a/internal/messages/build.go b/internal/messages/build.go index 2dbf43f..82749a2 100644 --- a/internal/messages/build.go +++ b/internal/messages/build.go @@ -45,6 +45,10 @@ func (r *BuildRequest) Validate() error { if r.Target == "" { return fmt.Errorf("target is required") } + // Target must be "all" or a valid hostname (same format as revision/branch) + if r.Target != "all" && !revisionRegex.MatchString(r.Target) { + return fmt.Errorf("invalid target format: %q", r.Target) + } if r.Branch != "" && !revisionRegex.MatchString(r.Branch) { return fmt.Errorf("invalid branch format: %q", r.Branch) } -- 2.49.1 From c52e88ca7e61991bee4b4abf8fb6a6484b3d947e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 10 Feb 2026 22:09:51 +0100 Subject: [PATCH 3/4] fix: add validation for config and reply subjects Address medium severity security issues: - Validate repo names in config only allow alphanumeric, dash, underscore (prevents NATS subject injection via dots or wildcards) - Validate repo URLs must start with git+https://, git+ssh://, or git+file:// - Validate ReplyTo field must start with "build.responses." 
to prevent publishing responses to arbitrary NATS subjects Co-Authored-By: Claude Opus 4.5 --- internal/builder/config.go | 31 +++++++++++++++++++++++++++++++ internal/messages/build.go | 5 +++++ 2 files changed, 36 insertions(+) diff --git a/internal/builder/config.go b/internal/builder/config.go index bd3d87a..56e6b12 100644 --- a/internal/builder/config.go +++ b/internal/builder/config.go @@ -3,10 +3,23 @@ package builder import ( "fmt" "os" + "regexp" + "strings" "gopkg.in/yaml.v3" ) +// repoNameRegex validates repository names for safe use in NATS subjects. +// Only allows alphanumeric, dashes, and underscores (no dots or wildcards). +var repoNameRegex = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`) + +// validURLPrefixes are the allowed prefixes for repository URLs. +var validURLPrefixes = []string{ + "git+https://", + "git+ssh://", + "git+file://", +} + // RepoConfig holds configuration for a single repository. type RepoConfig struct { URL string `yaml:"url"` @@ -44,9 +57,27 @@ func (c *Config) Validate() error { } for name, repo := range c.Repos { + // Validate repo name for safe use in NATS subjects + if !repoNameRegex.MatchString(name) { + return fmt.Errorf("repo name %q contains invalid characters (only alphanumeric, dash, underscore allowed)", name) + } + if repo.URL == "" { return fmt.Errorf("repo %q: url is required", name) } + + // Validate URL format + validURL := false + for _, prefix := range validURLPrefixes { + if strings.HasPrefix(repo.URL, prefix) { + validURL = true + break + } + } + if !validURL { + return fmt.Errorf("repo %q: url must start with git+https://, git+ssh://, or git+file://", name) + } + if repo.DefaultBranch == "" { return fmt.Errorf("repo %q: default_branch is required", name) } diff --git a/internal/messages/build.go b/internal/messages/build.go index 82749a2..da310df 100644 --- a/internal/messages/build.go +++ b/internal/messages/build.go @@ -3,6 +3,7 @@ package messages import ( "encoding/json" "fmt" + "strings" ) // BuildStatus 
represents the status of a build response. @@ -55,6 +56,10 @@ func (r *BuildRequest) Validate() error { if r.ReplyTo == "" { return fmt.Errorf("reply_to is required") } + // Validate reply_to format to prevent publishing to arbitrary subjects + if !strings.HasPrefix(r.ReplyTo, "build.responses.") { + return fmt.Errorf("invalid reply_to format: must start with 'build.responses.'") + } return nil } -- 2.49.1 From 00899489ac0fe176893528cf1aa1fdf0f15d0ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Tue, 10 Feb 2026 22:13:33 +0100 Subject: [PATCH 4/4] feat(nixos): add settings option for builder config Allow defining builder repository configuration directly in Nix using the `settings.repos` option, which is more idiomatic for NixOS modules. Users can now choose between: - `settings.repos` - Define repos in Nix (recommended) - `configFile` - Point to an external YAML file The module generates a YAML config file from settings when configFile is not specified. An assertion ensures at least one method is used. 
Co-Authored-By: Claude Opus 4.5 --- README.md | 42 ++++++++++++++++++++++++----- nixos/module.nix | 70 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 102 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2eef68e..042db6b 100644 --- a/README.md +++ b/README.md @@ -322,14 +322,47 @@ Default `deploySubjects`: | `package` | package | from flake | Package to use | | `natsUrl` | string | required | NATS server URL | | `nkeyFile` | path | required | Path to NKey seed file | -| `configFile` | path | required | Path to builder configuration file | +| `configFile` | path | `null` | Path to builder config file (alternative to `settings`) | +| `settings.repos` | attrs | `{}` | Repository configuration (see below) | | `timeout` | int | `1800` | Build timeout per host in seconds | | `environment` | attrs | `{}` | Additional environment variables | | `metrics.enable` | bool | `false` | Enable Prometheus metrics endpoint | | `metrics.address` | string | `":9973"` | Metrics HTTP server address | | `metrics.openFirewall` | bool | `false` | Open firewall for metrics port | -Example builder configuration: +Each entry in `settings.repos` is an attribute set with: + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `url` | string | required | Git flake URL (must start with `git+https://`, `git+ssh://`, or `git+file://`) | +| `defaultBranch` | string | `"master"` | Default branch to build when not specified | + +Example builder configuration using `settings`: + +```nix +services.homelab-deploy.builder = { + enable = true; + natsUrl = "nats://nats.example.com:4222"; + nkeyFile = "/run/secrets/homelab-deploy-builder-nkey"; + settings.repos = { + nixos-servers = { + url = "git+https://git.example.com/org/nixos-servers.git"; + defaultBranch = "master"; + }; + homelab = { + url = "git+ssh://git@github.com/user/homelab.git"; + defaultBranch = "main"; + }; + }; + metrics = { + enable = true; + address = ":9973"; + 
openFirewall = true; + }; +}; +``` + +Alternatively, you can use `configFile` to point to an external YAML file: ```nix services.homelab-deploy.builder = { @@ -337,11 +370,6 @@ services.homelab-deploy.builder = { natsUrl = "nats://nats.example.com:4222"; nkeyFile = "/run/secrets/homelab-deploy-builder-nkey"; configFile = "/etc/homelab-deploy/builder.yaml"; - metrics = { - enable = true; - address = ":9973"; - openFirewall = true; - }; }; ``` diff --git a/nixos/module.nix b/nixos/module.nix index 298015e..2c34fbd 100644 --- a/nixos/module.nix +++ b/nixos/module.nix @@ -5,6 +5,20 @@ let listenerCfg = config.services.homelab-deploy.listener; builderCfg = config.services.homelab-deploy.builder; + # Generate YAML config from settings + generatedConfigFile = pkgs.writeText "builder.yaml" (lib.generators.toYAML {} { + repos = lib.mapAttrs (name: repo: { + url = repo.url; + default_branch = repo.defaultBranch; + }) builderCfg.settings.repos; + }); + + # Use provided configFile or generate from settings + builderConfigFile = + if builderCfg.configFile != null + then builderCfg.configFile + else generatedConfigFile; + # Build command line arguments for listener from configuration listenerArgs = lib.concatStringsSep " " ([ "--hostname ${lib.escapeShellArg listenerCfg.hostname}" @@ -26,7 +40,7 @@ let builderArgs = lib.concatStringsSep " " ([ "--nats-url ${lib.escapeShellArg builderCfg.natsUrl}" "--nkey-file ${lib.escapeShellArg builderCfg.nkeyFile}" - "--config ${lib.escapeShellArg builderCfg.configFile}" + "--config ${lib.escapeShellArg "${builderConfigFile}"}" "--timeout ${toString builderCfg.timeout}" ] ++ lib.optionals builderCfg.metrics.enable [ @@ -161,11 +175,52 @@ in }; configFile = lib.mkOption { - type = lib.types.path; - description = "Path to builder configuration file (YAML)"; + type = lib.types.nullOr lib.types.path; + default = null; + description = '' + Path to builder configuration file (YAML). + If not specified, a config file will be generated from the `settings` option.
+ ''; example = "/etc/homelab-deploy/builder.yaml"; }; + settings = { + repos = lib.mkOption { + type = lib.types.attrsOf (lib.types.submodule { + options = { + url = lib.mkOption { + type = lib.types.str; + description = "Git flake URL for the repository"; + example = "git+https://git.example.com/org/nixos-configs.git"; + }; + defaultBranch = lib.mkOption { + type = lib.types.str; + default = "master"; + description = "Default branch to build when not specified in request"; + example = "main"; + }; + }; + }); + default = {}; + description = '' + Repository configuration for the builder. + Each key is the repository name used in build requests. + ''; + example = lib.literalExpression '' + { + nixos-servers = { + url = "git+https://git.example.com/org/nixos-servers.git"; + defaultBranch = "master"; + }; + homelab = { + url = "git+ssh://git@github.com/user/homelab.git"; + defaultBranch = "main"; + }; + } + ''; + }; + }; + timeout = lib.mkOption { type = lib.types.int; default = 1800; @@ -198,6 +253,15 @@ in }; config = lib.mkMerge [ + (lib.mkIf builderCfg.enable { + assertions = [ + { + assertion = builderCfg.configFile != null || builderCfg.settings.repos != {}; + message = "services.homelab-deploy.builder: either configFile or settings.repos must be specified"; + } + ]; + }) + (lib.mkIf listenerCfg.enable { systemd.services.homelab-deploy-listener = { description = "homelab-deploy listener"; -- 2.49.1