This repository has been archived on 2026-03-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
homelab-deploy/internal/builder/builder.go
Torjus Håkestad 1a23847d31 fix(builder): separate build output from error to preserve timeout messages
When a build timed out, the timeout error was silently replaced by
truncated stderr output. Split into separate Error and Output fields
on BuildHostResult so the cause (e.g. "build timed out after 30m0s")
is always visible in logs and CLI output.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 13:24:04 +01:00

354 lines
9.0 KiB
Go

package builder
import (
"context"
"fmt"
"log/slog"
"regexp"
"sort"
"strings"
"sync"
"time"
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"git.t-juice.club/torjus/homelab-deploy/internal/metrics"
"git.t-juice.club/torjus/homelab-deploy/internal/nats"
)
// hostnameRegex validates hostnames from flake output.
// Allows: alphanumeric, dashes, underscores, dots.
var hostnameRegex = regexp.MustCompile(`^[a-zA-Z0-9._-]+$`)
// truncateOutput truncates output to the first and last N lines if it exceeds 2*N lines.
func truncateOutput(output string, keepLines int) string {
lines := strings.Split(output, "\n")
if len(lines) <= keepLines*2 {
return output
}
head := lines[:keepLines]
tail := lines[len(lines)-keepLines:]
omitted := len(lines) - keepLines*2
return strings.Join(head, "\n") + fmt.Sprintf("\n\n... (%d lines omitted) ...\n\n", omitted) + strings.Join(tail, "\n")
}
// BuilderConfig holds the configuration for the builder.
type BuilderConfig struct {
NATSUrl string
NKeyFile string
ConfigFile string
Timeout time.Duration
MetricsEnabled bool
MetricsAddr string
}
// Builder handles build requests from NATS.
type Builder struct {
cfg BuilderConfig
repoCfg *Config
client *nats.Client
executor *Executor
lock sync.Mutex
busy bool
logger *slog.Logger
// metrics server and collector (nil if metrics disabled)
metricsServer *metrics.Server
metrics *metrics.BuildCollector
}
// New creates a new builder with the given configuration.
func New(cfg BuilderConfig, repoCfg *Config, logger *slog.Logger) *Builder {
if logger == nil {
logger = slog.Default()
}
b := &Builder{
cfg: cfg,
repoCfg: repoCfg,
executor: NewExecutor(cfg.Timeout),
logger: logger,
}
if cfg.MetricsEnabled {
b.metricsServer = metrics.NewServer(metrics.ServerConfig{
Addr: cfg.MetricsAddr,
Logger: logger,
})
b.metrics = metrics.NewBuildCollector(b.metricsServer.Registry())
}
return b
}
// Run starts the builder and blocks until the context is cancelled.
func (b *Builder) Run(ctx context.Context) error {
// Start metrics server if enabled
if b.metricsServer != nil {
if err := b.metricsServer.Start(); err != nil {
return fmt.Errorf("failed to start metrics server: %w", err)
}
defer func() {
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = b.metricsServer.Shutdown(shutdownCtx)
}()
}
// Connect to NATS
b.logger.Info("connecting to NATS", "url", b.cfg.NATSUrl)
client, err := nats.Connect(nats.Config{
URL: b.cfg.NATSUrl,
NKeyFile: b.cfg.NKeyFile,
Name: "homelab-deploy-builder",
})
if err != nil {
return fmt.Errorf("failed to connect to NATS: %w", err)
}
b.client = client
defer b.client.Close()
b.logger.Info("connected to NATS")
// Subscribe to build subjects for each repo
for repoName := range b.repoCfg.Repos {
// Subscribe to build.<repo>.all and build.<repo>.<hostname>
allSubject := fmt.Sprintf("build.%s.*", repoName)
b.logger.Info("subscribing to build subject", "subject", allSubject)
if _, err := b.client.Subscribe(allSubject, b.handleBuildRequest); err != nil {
return fmt.Errorf("failed to subscribe to %s: %w", allSubject, err)
}
}
b.logger.Info("builder started", "repos", len(b.repoCfg.Repos))
// Wait for context cancellation
<-ctx.Done()
b.logger.Info("shutting down builder")
return nil
}
func (b *Builder) handleBuildRequest(subject string, data []byte) {
req, err := messages.UnmarshalBuildRequest(data)
if err != nil {
b.logger.Error("failed to unmarshal build request",
"subject", subject,
"error", err,
)
return
}
b.logger.Info("received build request",
"subject", subject,
"repo", req.Repo,
"target", req.Target,
"branch", req.Branch,
"reply_to", req.ReplyTo,
)
// Validate request
if err := req.Validate(); err != nil {
b.logger.Warn("invalid build request", "error", err)
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusRejected,
err.Error(),
))
return
}
// Get repo config
repo, err := b.repoCfg.GetRepo(req.Repo)
if err != nil {
b.logger.Warn("unknown repo", "repo", req.Repo)
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusRejected,
fmt.Sprintf("unknown repo: %s", req.Repo),
))
return
}
// Try to acquire lock
b.lock.Lock()
if b.busy {
b.lock.Unlock()
b.logger.Warn("build already in progress")
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusRejected,
"another build is already in progress",
))
return
}
b.busy = true
b.lock.Unlock()
defer func() {
b.lock.Lock()
b.busy = false
b.lock.Unlock()
}()
// Use default branch if not specified
branch := req.Branch
if branch == "" {
branch = repo.DefaultBranch
}
// Determine hosts to build
var hosts []string
if req.Target == "all" {
// List hosts from flake
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusStarted,
"discovering hosts...",
))
hosts, err = b.executor.ListHosts(context.Background(), repo.URL, branch)
if err != nil {
b.logger.Error("failed to list hosts", "error", err)
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusFailed,
fmt.Sprintf("failed to list hosts: %v", err),
).WithError(err.Error()))
if b.metrics != nil {
b.metrics.RecordBuildFailure(req.Repo, "")
}
return
}
// Filter out hostnames with invalid characters (security: prevent injection)
validHosts := make([]string, 0, len(hosts))
for _, host := range hosts {
if hostnameRegex.MatchString(host) {
validHosts = append(validHosts, host)
} else {
b.logger.Warn("skipping hostname with invalid characters", "hostname", host)
}
}
hosts = validHosts
// Sort hosts for consistent ordering
sort.Strings(hosts)
} else {
hosts = []string{req.Target}
}
if len(hosts) == 0 {
b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
messages.BuildStatusFailed,
"no hosts to build",
))
return
}
// Send started response
b.sendResponse(req.ReplyTo, &messages.BuildResponse{
Status: messages.BuildStatusStarted,
Message: fmt.Sprintf("building %d host(s)", len(hosts)),
HostsTotal: len(hosts),
})
// Build each host sequentially
startTime := time.Now()
results := make([]messages.BuildHostResult, 0, len(hosts))
succeeded := 0
failed := 0
for i, host := range hosts {
hostStart := time.Now()
b.logger.Info("building host",
"host", host,
"progress", fmt.Sprintf("%d/%d", i+1, len(hosts)),
"command", b.executor.BuildCommand(repo.URL, branch, host),
)
result := b.executor.Build(context.Background(), repo.URL, branch, host)
hostDuration := time.Since(hostStart).Seconds()
hostResult := messages.BuildHostResult{
Host: host,
Success: result.Success,
DurationSeconds: hostDuration,
}
if !result.Success {
if result.Error != nil {
hostResult.Error = result.Error.Error()
}
if result.Stderr != "" {
hostResult.Output = truncateOutput(result.Stderr, 50)
}
}
results = append(results, hostResult)
if result.Success {
succeeded++
b.logger.Info("host build succeeded", "host", host, "duration", hostDuration)
if b.metrics != nil {
b.metrics.RecordHostBuildSuccess(req.Repo, host, hostDuration)
}
} else {
failed++
b.logger.Error("host build failed", "host", host, "error", hostResult.Error, "output", hostResult.Output)
if b.metrics != nil {
b.metrics.RecordHostBuildFailure(req.Repo, host, hostDuration)
}
}
// Send progress update
success := result.Success
b.sendResponse(req.ReplyTo, &messages.BuildResponse{
Status: messages.BuildStatusProgress,
Host: host,
HostSuccess: &success,
HostsCompleted: i + 1,
HostsTotal: len(hosts),
})
}
totalDuration := time.Since(startTime).Seconds()
// Send final response
status := messages.BuildStatusCompleted
message := fmt.Sprintf("built %d/%d hosts successfully", succeeded, len(hosts))
if failed > 0 {
status = messages.BuildStatusFailed
message = fmt.Sprintf("build failed: %d/%d hosts failed", failed, len(hosts))
}
b.sendResponse(req.ReplyTo, &messages.BuildResponse{
Status: status,
Message: message,
Results: results,
TotalDurationSeconds: totalDuration,
Succeeded: succeeded,
Failed: failed,
})
// Record overall build metrics
if b.metrics != nil {
if failed == 0 {
b.metrics.RecordBuildSuccess(req.Repo)
} else {
b.metrics.RecordBuildFailure(req.Repo, "")
}
}
}
func (b *Builder) sendResponse(replyTo string, resp *messages.BuildResponse) {
data, err := resp.Marshal()
if err != nil {
b.logger.Error("failed to marshal build response", "error", err)
return
}
if err := b.client.Publish(replyTo, data); err != nil {
b.logger.Error("failed to publish build response",
"reply_to", replyTo,
"error", err,
)
}
// Flush to ensure response is sent immediately
if err := b.client.Flush(); err != nil {
b.logger.Error("failed to flush", "error", err)
}
}