feat: implement NATS-based NixOS deployment system

Implement the complete homelab-deploy system with three operational modes:

- Listener mode: Runs on NixOS hosts as a systemd service, subscribes to
  NATS subjects with configurable templates, executes nixos-rebuild on
  deployment requests with concurrency control

- MCP mode: MCP server exposing deploy, deploy_admin, and list_hosts
  tools for AI assistants with tiered access control

- CLI mode: Manual deployment commands with subject alias support via
  environment variables

Key components:
- internal/messages: Request/response types with validation
- internal/nats: Client wrapper with NKey authentication
- internal/deploy: Executor with timeout and lock for concurrency
- internal/listener: Subject template expansion and request handling
- internal/cli: Deploy logic with alias resolution
- internal/mcp: MCP server with mcp-go integration
- nixos/module.nix: NixOS module with hardened systemd service

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-07 04:19:47 +01:00
parent ad7d1a650c
commit fa49e9322a
27 changed files with 2929 additions and 26 deletions

267
cmd/homelab-deploy/main.go Normal file
View File

@@ -0,0 +1,267 @@
package main
import (
"context"
"fmt"
"log/slog"
"os"
"os/signal"
"syscall"
"time"
deploycli "git.t-juice.club/torjus/homelab-deploy/internal/cli"
"git.t-juice.club/torjus/homelab-deploy/internal/listener"
"git.t-juice.club/torjus/homelab-deploy/internal/mcp"
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"github.com/urfave/cli/v3"
)
// version is the release string of the homelab-deploy binary; main passes it
// to the root command's Version field.
const version = "0.1.0"
// main assembles the root "homelab-deploy" command from its three subcommands
// (listener, mcp and deploy) and runs it against the process arguments. If the
// command returns an error, it is written to stderr and the process exits
// with status 1.
func main() {
	subcommands := []*cli.Command{
		listenerCommand(),
		mcpCommand(),
		deployCommand(),
	}

	root := &cli.Command{
		Name:     "homelab-deploy",
		Usage:    "Message-based NixOS deployment system using NATS",
		Version:  version,
		Commands: subcommands,
	}

	err := root.Run(context.Background(), os.Args)
	if err == nil {
		return
	}

	fmt.Fprintf(os.Stderr, "error: %v\n", err)
	os.Exit(1)
}
// listenerCommand builds the "listener" subcommand.
//
// The action rejects any --tier other than "test" or "prod", copies the flag
// values into a listener.Config (converting --timeout from seconds to a
// time.Duration), creates a JSON slog logger that writes to stdout at Info
// level, and runs the listener with a context that is cancelled on SIGINT or
// SIGTERM.
func listenerCommand() *cli.Command {
	// Subject templates used when --deploy-subject is not supplied.
	// NOTE(review): --role is optional, yet the last template references
	// <role>; how an empty role is treated is decided outside this file —
	// confirm against the listener package.
	defaultSubjects := []string{
		"deploy.<tier>.<hostname>",
		"deploy.<tier>.all",
		"deploy.<tier>.role.<role>",
	}

	flags := []cli.Flag{
		&cli.StringFlag{Name: "hostname", Usage: "Hostname for this listener", Required: true},
		&cli.StringFlag{Name: "tier", Usage: "Deployment tier (test or prod)", Required: true},
		&cli.StringFlag{Name: "role", Usage: "Role for role-based deployment targeting"},
		&cli.StringFlag{Name: "nats-url", Usage: "NATS server URL", Required: true},
		&cli.StringFlag{Name: "nkey-file", Usage: "Path to NKey seed file for NATS authentication", Required: true},
		&cli.StringFlag{Name: "flake-url", Usage: "Git flake URL for nixos-rebuild", Required: true},
		&cli.IntFlag{Name: "timeout", Usage: "Deployment timeout in seconds", Value: 600},
		&cli.StringSliceFlag{
			Name:  "deploy-subject",
			Usage: "NATS subject to subscribe to for deployments (can be repeated)",
			Value: defaultSubjects,
		},
		&cli.StringFlag{Name: "discover-subject", Usage: "NATS subject for host discovery requests", Value: "deploy.discover"},
	}

	run := func(ctx context.Context, c *cli.Command) error {
		// Only the two known tiers are accepted; anything else is a usage error.
		tier := c.String("tier")
		switch tier {
		case "test", "prod":
		default:
			return fmt.Errorf("tier must be 'test' or 'prod', got %q", tier)
		}

		// The flag is expressed in whole seconds.
		timeout := time.Duration(c.Int("timeout")) * time.Second

		cfg := listener.Config{
			Hostname:        c.String("hostname"),
			Tier:            tier,
			Role:            c.String("role"),
			NATSUrl:         c.String("nats-url"),
			NKeyFile:        c.String("nkey-file"),
			FlakeURL:        c.String("flake-url"),
			Timeout:         timeout,
			DeploySubjects:  c.StringSlice("deploy-subject"),
			DiscoverSubject: c.String("discover-subject"),
		}

		jsonHandler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelInfo})
		svc := listener.New(cfg, slog.New(jsonHandler))

		// Cancel the run context when the process receives SIGINT or SIGTERM.
		sigCtx, stop := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
		defer stop()

		return svc.Run(sigCtx)
	}

	return &cli.Command{
		Name:   "listener",
		Usage:  "Run as a deployment listener (systemd service mode)",
		Flags:  flags,
		Action: run,
	}
}
// mcpCommand builds the "mcp" subcommand.
//
// The action returns an error when --enable-admin is set without
// --admin-nkey-file. Otherwise it copies the flag values into an
// mcp.ServerConfig (converting --timeout from seconds to a time.Duration),
// constructs the server and returns the result of its Run method. The context
// passed to the action is not used.
func mcpCommand() *cli.Command {
	flags := []cli.Flag{
		&cli.StringFlag{Name: "nats-url", Usage: "NATS server URL", Required: true},
		&cli.StringFlag{Name: "nkey-file", Usage: "Path to NKey seed file for NATS authentication", Required: true},
		&cli.BoolFlag{Name: "enable-admin", Usage: "Enable admin deployment tool for all tiers"},
		&cli.StringFlag{Name: "admin-nkey-file", Usage: "Path to admin NKey seed file (required if --enable-admin)"},
		&cli.StringFlag{Name: "discover-subject", Usage: "NATS subject for host discovery", Value: "deploy.discover"},
		&cli.IntFlag{Name: "timeout", Usage: "Timeout in seconds for deployment operations", Value: 900},
	}

	run := func(_ context.Context, c *cli.Command) error {
		adminEnabled := c.Bool("enable-admin")
		adminKeyPath := c.String("admin-nkey-file")

		// The admin key file is only mandatory when admin mode is switched on.
		if adminEnabled && adminKeyPath == "" {
			return fmt.Errorf("--admin-nkey-file is required when --enable-admin is set")
		}

		// The flag is expressed in whole seconds.
		timeout := time.Duration(c.Int("timeout")) * time.Second

		server := mcp.New(mcp.ServerConfig{
			NATSUrl:         c.String("nats-url"),
			NKeyFile:        c.String("nkey-file"),
			EnableAdmin:     adminEnabled,
			AdminNKeyFile:   adminKeyPath,
			DiscoverSubject: c.String("discover-subject"),
			Timeout:         timeout,
		})

		return server.Run()
	}

	return &cli.Command{
		Name:   "mcp",
		Usage:  "Run as an MCP server for AI assistants",
		Flags:  flags,
		Action: run,
	}
}
// deployCommand builds the "deploy" subcommand, which takes the target
// subject (or an alias for one) as its first positional argument.
//
// The action:
//   - fails if no positional argument was given;
//   - resolves the argument through deploycli.ResolveAlias and prints the
//     mapping when the argument is an alias that resolved to something else;
//   - fails if --action does not pass messages.Action.Valid;
//   - calls deploycli.Deploy with a context that is cancelled on SIGINT or
//     SIGTERM, printing one line per response received;
//   - prints the number of responding hosts and returns an error unless
//     result.AllSucceeded() reports true.
func deployCommand() *cli.Command {
	flags := []cli.Flag{
		&cli.StringFlag{Name: "nats-url", Usage: "NATS server URL", Required: true},
		&cli.StringFlag{Name: "nkey-file", Usage: "Path to NKey seed file for NATS authentication", Required: true},
		&cli.StringFlag{Name: "branch", Usage: "Git branch or commit to deploy", Value: "master"},
		&cli.StringFlag{Name: "action", Usage: "nixos-rebuild action (switch, boot, test, dry-activate)", Value: "switch"},
		&cli.IntFlag{Name: "timeout", Usage: "Timeout in seconds for collecting responses", Value: 900},
	}

	// printResponse writes a single host response to stdout, appending the
	// error text to the status when the response carries one.
	printResponse := func(resp *messages.DeployResponse) {
		status := string(resp.Status)
		if errText := resp.Error; errText != nil {
			status = fmt.Sprintf("%s (%s)", status, *errText)
		}
		fmt.Printf("[%s] %s: %s\n", resp.Hostname, status, resp.Message)
	}

	run := func(ctx context.Context, c *cli.Command) error {
		args := c.Args()
		if args.Len() == 0 {
			return fmt.Errorf("subject argument required")
		}

		// Let the user know when the argument was an alias for another subject.
		rawSubject := args.First()
		subject := deploycli.ResolveAlias(rawSubject)
		if deploycli.IsAlias(rawSubject) && subject != rawSubject {
			fmt.Printf("Resolved alias %q to %q\n", rawSubject, subject)
		}

		action := messages.Action(c.String("action"))
		if !action.Valid() {
			return fmt.Errorf("invalid action: %q", action)
		}

		// The revision comes from the --branch flag; the timeout flag is in
		// whole seconds.
		revision := c.String("branch")
		cfg := deploycli.DeployConfig{
			NATSUrl:  c.String("nats-url"),
			NKeyFile: c.String("nkey-file"),
			Subject:  subject,
			Action:   action,
			Revision: revision,
			Timeout:  time.Duration(c.Int("timeout")) * time.Second,
		}

		fmt.Printf("Deploying to %s (action=%s, revision=%s)\n", subject, action, revision)

		// Cancel the deploy context when the process receives SIGINT or SIGTERM.
		sigCtx, stop := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
		defer stop()

		result, err := deploycli.Deploy(sigCtx, cfg, printResponse)
		if err != nil {
			return fmt.Errorf("deploy failed: %w", err)
		}

		fmt.Printf("\nDeployment complete: %d hosts responded\n", result.HostCount())
		if result.AllSucceeded() {
			return nil
		}
		return fmt.Errorf("some deployments failed")
	}

	return &cli.Command{
		Name:      "deploy",
		Usage:     "Deploy to a target subject",
		ArgsUsage: "<subject>",
		Flags:     flags,
		Action:    run,
	}
}