feat: add builder mode for centralized Nix builds

Add a new "builder" capability to trigger Nix builds on a dedicated
build host via NATS messaging. This allows pre-building NixOS
configurations before deployment.

New components:
- Builder mode: subscribes to build.<repo>.* subjects, executes nix build
- Build CLI command: triggers builds with progress tracking
- MCP build tool: available with --enable-builds flag
- Builder metrics: tracks build success/failure per repo and host
- NixOS module: services.homelab-deploy.builder

The builder uses a YAML config file to define allowed repositories
with their URLs and default branches. Builds can target all hosts
or specific hosts, with real-time progress updates.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-10 22:03:14 +01:00
parent 277a49a666
commit 14f5b31faf
13 changed files with 1535 additions and 57 deletions

323
internal/builder/builder.go Normal file
View File

@@ -0,0 +1,323 @@
package builder
import (
"context"
"fmt"
"log/slog"
"sort"
"sync"
"time"
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"git.t-juice.club/torjus/homelab-deploy/internal/metrics"
"git.t-juice.club/torjus/homelab-deploy/internal/nats"
)
// BuilderConfig holds the configuration for the builder.
type BuilderConfig struct {
NATSUrl string
NKeyFile string
ConfigFile string
Timeout time.Duration
MetricsEnabled bool
MetricsAddr string
}
// Builder handles build requests from NATS.
type Builder struct {
cfg BuilderConfig
repoCfg *Config
client *nats.Client
executor *Executor
lock sync.Mutex
busy bool
logger *slog.Logger
// metrics server and collector (nil if metrics disabled)
metricsServer *metrics.Server
metrics *metrics.BuildCollector
}
// New creates a new builder with the given configuration.
func New(cfg BuilderConfig, repoCfg *Config, logger *slog.Logger) *Builder {
if logger == nil {
logger = slog.Default()
}
b := &Builder{
cfg: cfg,
repoCfg: repoCfg,
executor: NewExecutor(cfg.Timeout),
logger: logger,
}
if cfg.MetricsEnabled {
b.metricsServer = metrics.NewServer(metrics.ServerConfig{
Addr: cfg.MetricsAddr,
Logger: logger,
})
b.metrics = metrics.NewBuildCollector(b.metricsServer.Registry())
}
return b
}
// Run starts the builder and blocks until the context is cancelled.
func (b *Builder) Run(ctx context.Context) error {
// Start metrics server if enabled
if b.metricsServer != nil {
if err := b.metricsServer.Start(); err != nil {
return fmt.Errorf("failed to start metrics server: %w", err)
}
defer func() {
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = b.metricsServer.Shutdown(shutdownCtx)
}()
}
// Connect to NATS
b.logger.Info("connecting to NATS", "url", b.cfg.NATSUrl)
client, err := nats.Connect(nats.Config{
URL: b.cfg.NATSUrl,
NKeyFile: b.cfg.NKeyFile,
Name: "homelab-deploy-builder",
})
if err != nil {
return fmt.Errorf("failed to connect to NATS: %w", err)
}
b.client = client
defer b.client.Close()
b.logger.Info("connected to NATS")
// Subscribe to build subjects for each repo
for repoName := range b.repoCfg.Repos {
// Subscribe to build.<repo>.all and build.<repo>.<hostname>
allSubject := fmt.Sprintf("build.%s.*", repoName)
b.logger.Info("subscribing to build subject", "subject", allSubject)
if _, err := b.client.Subscribe(allSubject, b.handleBuildRequest); err != nil {
return fmt.Errorf("failed to subscribe to %s: %w", allSubject, err)
}
}
b.logger.Info("builder started", "repos", len(b.repoCfg.Repos))
// Wait for context cancellation
<-ctx.Done()
b.logger.Info("shutting down builder")
return nil
}
func (b *Builder) handleBuildRequest(subject string, data []byte) {
req, err := messages.UnmarshalBuildRequest(data)
if err != nil {
b.logger.Error("failed to unmarshal build request",
"subject", subject,
"error", err,
)
return
}
b.logger.Info("received build request",
"subject", subject,
"repo", req.Repo,
"target", req.Target,
"branch", req.Branch,
"reply_to", req.ReplyTo,
)
// Validate request
if err := req.Validate(); err != nil {
b.logger.Warn("invalid build request", "error", err)
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusRejected,
err.Error(),
))
return
}
// Get repo config
repo, err := b.repoCfg.GetRepo(req.Repo)
if err != nil {
b.logger.Warn("unknown repo", "repo", req.Repo)
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusRejected,
fmt.Sprintf("unknown repo: %s", req.Repo),
))
return
}
// Try to acquire lock
b.lock.Lock()
if b.busy {
b.lock.Unlock()
b.logger.Warn("build already in progress")
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusRejected,
"another build is already in progress",
))
return
}
b.busy = true
b.lock.Unlock()
defer func() {
b.lock.Lock()
b.busy = false
b.lock.Unlock()
}()
// Use default branch if not specified
branch := req.Branch
if branch == "" {
branch = repo.DefaultBranch
}
// Determine hosts to build
var hosts []string
if req.Target == "all" {
// List hosts from flake
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusStarted,
"discovering hosts...",
))
hosts, err = b.executor.ListHosts(context.Background(), repo.URL, branch)
if err != nil {
b.logger.Error("failed to list hosts", "error", err)
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusFailed,
fmt.Sprintf("failed to list hosts: %v", err),
).WithError(err.Error()))
if b.metrics != nil {
b.metrics.RecordBuildFailure(req.Repo, "")
}
return
}
// Sort hosts for consistent ordering
sort.Strings(hosts)
} else {
hosts = []string{req.Target}
}
if len(hosts) == 0 {
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusFailed,
"no hosts to build",
))
return
}
// Send started response
b.sendResponse(req.ReplyTo, &messages.BuildResponse{
Status: messages.BuildStatusStarted,
Message: fmt.Sprintf("building %d host(s)", len(hosts)),
HostsTotal: len(hosts),
})
// Build each host sequentially
startTime := time.Now()
results := make([]messages.BuildHostResult, 0, len(hosts))
succeeded := 0
failed := 0
for i, host := range hosts {
hostStart := time.Now()
b.logger.Info("building host",
"host", host,
"progress", fmt.Sprintf("%d/%d", i+1, len(hosts)),
"command", b.executor.BuildCommand(repo.URL, branch, host),
)
result := b.executor.Build(context.Background(), repo.URL, branch, host)
hostDuration := time.Since(hostStart).Seconds()
hostResult := messages.BuildHostResult{
Host: host,
Success: result.Success,
DurationSeconds: hostDuration,
}
if !result.Success {
hostResult.Error = result.Stderr
if hostResult.Error == "" && result.Error != nil {
hostResult.Error = result.Error.Error()
}
}
results = append(results, hostResult)
if result.Success {
succeeded++
b.logger.Info("host build succeeded", "host", host, "duration", hostDuration)
if b.metrics != nil {
b.metrics.RecordHostBuildSuccess(req.Repo, host, hostDuration)
}
} else {
failed++
b.logger.Error("host build failed", "host", host, "error", hostResult.Error)
if b.metrics != nil {
b.metrics.RecordHostBuildFailure(req.Repo, host, hostDuration)
}
}
// Send progress update
success := result.Success
b.sendResponse(req.ReplyTo, &messages.BuildResponse{
Status: messages.BuildStatusProgress,
Host: host,
HostSuccess: &success,
HostsCompleted: i + 1,
HostsTotal: len(hosts),
})
}
totalDuration := time.Since(startTime).Seconds()
// Send final response
status := messages.BuildStatusCompleted
message := fmt.Sprintf("built %d/%d hosts successfully", succeeded, len(hosts))
if failed > 0 {
status = messages.BuildStatusFailed
message = fmt.Sprintf("build failed: %d/%d hosts failed", failed, len(hosts))
}
b.sendResponse(req.ReplyTo, &messages.BuildResponse{
Status: status,
Message: message,
Results: results,
TotalDurationSeconds: totalDuration,
Succeeded: succeeded,
Failed: failed,
})
// Record overall build metrics
if b.metrics != nil {
if failed == 0 {
b.metrics.RecordBuildSuccess(req.Repo)
} else {
b.metrics.RecordBuildFailure(req.Repo, "")
}
}
}
func (b *Builder) sendResponse(replyTo string, resp *messages.BuildResponse) {
data, err := resp.Marshal()
if err != nil {
b.logger.Error("failed to marshal build response", "error", err)
return
}
if err := b.client.Publish(replyTo, data); err != nil {
b.logger.Error("failed to publish build response",
"reply_to", replyTo,
"error", err,
)
}
// Flush to ensure response is sent immediately
if err := b.client.Flush(); err != nil {
b.logger.Error("failed to flush", "error", err)
}
}

View File

@@ -0,0 +1,65 @@
package builder
import (
"fmt"
"os"
"gopkg.in/yaml.v3"
)
// RepoConfig holds configuration for a single repository.
type RepoConfig struct {
URL string `yaml:"url"`
DefaultBranch string `yaml:"default_branch"`
}
// Config holds the builder configuration.
type Config struct {
Repos map[string]RepoConfig `yaml:"repos"`
}
// LoadConfig loads configuration from a YAML file.
func LoadConfig(path string) (*Config, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("failed to read config file: %w", err)
}
var cfg Config
if err := yaml.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("failed to parse config file: %w", err)
}
if err := cfg.Validate(); err != nil {
return nil, err
}
return &cfg, nil
}
// Validate checks that the configuration is valid.
func (c *Config) Validate() error {
if len(c.Repos) == 0 {
return fmt.Errorf("no repos configured")
}
for name, repo := range c.Repos {
if repo.URL == "" {
return fmt.Errorf("repo %q: url is required", name)
}
if repo.DefaultBranch == "" {
return fmt.Errorf("repo %q: default_branch is required", name)
}
}
return nil
}
// GetRepo returns the configuration for a repository, or an error if not found.
func (c *Config) GetRepo(name string) (*RepoConfig, error) {
repo, ok := c.Repos[name]
if !ok {
return nil, fmt.Errorf("repo %q not found in configuration", name)
}
return &repo, nil
}

View File

@@ -0,0 +1,116 @@
package builder
import (
"bytes"
"context"
"encoding/json"
"fmt"
"os/exec"
"time"
)
// Executor handles the execution of nix build commands.
type Executor struct {
timeout time.Duration
}
// NewExecutor creates a new build executor.
func NewExecutor(timeout time.Duration) *Executor {
return &Executor{
timeout: timeout,
}
}
// BuildResult contains the result of a build execution.
type BuildResult struct {
Success bool
ExitCode int
Stdout string
Stderr string
Error error
}
// FlakeShowResult contains the parsed output of nix flake show.
type FlakeShowResult struct {
NixosConfigurations map[string]any `json:"nixosConfigurations"`
}
// ListHosts returns the list of hosts (nixosConfigurations) available in a flake.
func (e *Executor) ListHosts(ctx context.Context, flakeURL, branch string) ([]string, error) {
ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
defer cancel()
flakeRef := fmt.Sprintf("%s?ref=%s", flakeURL, branch)
cmd := exec.CommandContext(ctx, "nix", "flake", "show", "--json", flakeRef)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
if ctx.Err() == context.DeadlineExceeded {
return nil, fmt.Errorf("timeout listing hosts")
}
return nil, fmt.Errorf("failed to list hosts: %w\n%s", err, stderr.String())
}
var result FlakeShowResult
if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
return nil, fmt.Errorf("failed to parse flake show output: %w", err)
}
hosts := make([]string, 0, len(result.NixosConfigurations))
for host := range result.NixosConfigurations {
hosts = append(hosts, host)
}
return hosts, nil
}
// Build builds a single host's system configuration.
func (e *Executor) Build(ctx context.Context, flakeURL, branch, host string) *BuildResult {
ctx, cancel := context.WithTimeout(ctx, e.timeout)
defer cancel()
// Build the flake reference for the system toplevel
flakeRef := fmt.Sprintf("%s?ref=%s#nixosConfigurations.%s.config.system.build.toplevel", flakeURL, branch, host)
cmd := exec.CommandContext(ctx, "nix", "build", "--no-link", flakeRef)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err := cmd.Run()
result := &BuildResult{
Stdout: stdout.String(),
Stderr: stderr.String(),
}
if err != nil {
result.Success = false
result.Error = err
if ctx.Err() == context.DeadlineExceeded {
result.Error = fmt.Errorf("build timed out after %v", e.timeout)
}
if exitErr, ok := err.(*exec.ExitError); ok {
result.ExitCode = exitErr.ExitCode()
} else {
result.ExitCode = -1
}
} else {
result.Success = true
result.ExitCode = 0
}
return result
}
// BuildCommand returns the command that would be executed (for logging/debugging).
func (e *Executor) BuildCommand(flakeURL, branch, host string) string {
flakeRef := fmt.Sprintf("%s?ref=%s#nixosConfigurations.%s.config.system.build.toplevel", flakeURL, branch, host)
return fmt.Sprintf("nix build --no-link %s", flakeRef)
}