feat: implement NATS-based NixOS deployment system

Implement the complete homelab-deploy system with three operational modes:

- Listener mode: Runs on NixOS hosts as a systemd service, subscribes to
  NATS subjects with configurable templates, executes nixos-rebuild on
  deployment requests with concurrency control

- MCP mode: MCP server exposing deploy, deploy_admin, and list_hosts
  tools for AI assistants with tiered access control

- CLI mode: Manual deployment commands with subject alias support via
  environment variables

Key components:
- internal/messages: Request/response types with validation
- internal/nats: Client wrapper with NKey authentication
- internal/deploy: Executor with timeout and lock for concurrency
- internal/listener: Subject template expansion and request handling
- internal/cli: Deploy logic with alias resolution
- internal/mcp: MCP server with mcp-go integration
- nixos/module.nix: NixOS module with hardened systemd service

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-07 04:19:47 +01:00
parent ad7d1a650c
commit fa49e9322a
27 changed files with 2929 additions and 26 deletions

112
internal/deploy/executor.go Normal file
View File

@@ -0,0 +1,112 @@
package deploy
import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"os/exec"
	"strings"
	"time"

	"git.t-juice.club/torjus/homelab-deploy/internal/messages"
)
// Executor runs nixos-rebuild for one host out of a flake repository.
type Executor struct {
	// flakeURL locates the flake, e.g. git+https://git.example.com/user/repo.git.
	flakeURL string
	// hostname is the flake output to build (the part after '#' in the flake reference).
	hostname string
	// timeout is the upper bound applied to a single Execute call.
	timeout time.Duration
}

// NewExecutor returns an Executor that deploys hostname from the flake at
// flakeURL and aborts any single deployment after timeout.
func NewExecutor(flakeURL, hostname string, timeout time.Duration) *Executor {
	e := Executor{
		timeout:  timeout,
		hostname: hostname,
		flakeURL: flakeURL,
	}
	return &e
}
// Result describes the outcome of one deployment run as reported by Execute.
type Result struct {
	// Success is true only when the command ran and returned no error.
	Success bool
	// ExitCode is 0 on success, the process exit code when the command
	// exited with a failure status, and -1 when no exit status is available.
	ExitCode int
	// Stdout and Stderr hold everything the command wrote to each stream.
	Stdout, Stderr string
	// Error is nil on success; otherwise it is the run error, or a
	// timeout error when the deployment deadline was exceeded.
	Error error
}
// ValidateRevision checks that revision names a ref in the remote repository
// behind the executor's flake URL.
//
// It runs "git ls-remote --exit-code <url> <revision>" under a 30 second
// timeout. The flake URL looks like git+https://git.example.com/user/repo.git,
// so the "git+" prefix is stripped before the URL is handed to git.
//
// It returns nil when git reports a matching ref. Otherwise the error tells
// the three failure modes apart:
//   - the lookup timed out;
//   - git exited with status 2, i.e. no ref matched revision;
//   - git failed for any other reason, in which case whatever git printed on
//     stderr is appended to the error.
//
// The last two errors wrap the underlying exec error.
func (e *Executor) ValidateRevision(ctx context.Context, revision string) error {
	gitURL := strings.TrimPrefix(e.flakeURL, "git+")

	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "git", "ls-remote", "--exit-code", gitURL, revision)
	var stderr bytes.Buffer
	cmd.Stderr = &stderr

	err := cmd.Run()
	if err == nil {
		return nil
	}
	if errors.Is(ctx.Err(), context.DeadlineExceeded) {
		return errors.New("timeout validating revision")
	}

	// With --exit-code, git ls-remote uses status 2 to say that it talked to
	// the remote but found no matching ref.
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) && exitErr.ExitCode() == 2 {
		return fmt.Errorf("revision %q not found: %w", revision, err)
	}

	// Any other failure is not evidence that the revision is missing, so
	// report it as a validation failure and surface git's own diagnostic.
	if msg := strings.TrimSpace(stderr.String()); msg != "" {
		return fmt.Errorf("validating revision %q: %w: %s", revision, err, msg)
	}
	return fmt.Errorf("validating revision %q: %w", revision, err)
}
// Execute runs "nixos-rebuild <action> --flake <flake-url>?ref=<revision>#<hostname>"
// and reports the outcome.
//
// The run is bounded by the executor's timeout on top of whatever deadline ctx
// already carries. The returned Result is never nil and always contains the
// captured stdout and stderr. On failure, Error holds the run error (replaced
// by a timeout error when the deadline was exceeded) and ExitCode holds the
// process exit code, or -1 when the error is not an *exec.ExitError.
//
// NOTE(review): presumably only the direct nixos-rebuild process is killed on
// timeout; confirm whether its child processes need handling as well.
func (e *Executor) Execute(ctx context.Context, action messages.Action, revision string) *Result {
	runCtx, stop := context.WithTimeout(ctx, e.timeout)
	defer stop()

	// Flake reference: <flake-url>?ref=<revision>#<hostname>
	target := e.flakeURL + "?ref=" + revision + "#" + e.hostname

	var outBuf, errBuf bytes.Buffer
	rebuild := exec.CommandContext(runCtx, "nixos-rebuild", string(action), "--flake", target)
	rebuild.Stdout, rebuild.Stderr = &outBuf, &errBuf

	runErr := rebuild.Run()

	res := &Result{
		Stdout: outBuf.String(),
		Stderr: errBuf.String(),
	}
	if runErr == nil {
		res.Success = true
		return res
	}

	// Prefer a readable timeout message over the raw error from the killed process.
	res.Error = runErr
	if runCtx.Err() == context.DeadlineExceeded {
		res.Error = fmt.Errorf("deployment timed out after %v", e.timeout)
	}

	// -1 marks failures that carry no exit status (e.g. the binary could not be started).
	res.ExitCode = -1
	if exitErr, ok := runErr.(*exec.ExitError); ok {
		res.ExitCode = exitErr.ExitCode()
	}
	return res
}
// BuildCommand renders the nixos-rebuild invocation for action and revision as
// a single string, for logging and debugging. It does not run anything.
func (e *Executor) BuildCommand(action messages.Action, revision string) string {
	return fmt.Sprintf("nixos-rebuild %s --flake %s?ref=%s#%s", action, e.flakeURL, revision, e.hostname)
}

View File

@@ -0,0 +1,76 @@
package deploy
import (
"testing"
"time"
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
)
// TestExecutor_BuildCommand checks the command string rendered for every
// rebuild action across branch, tag, and commit-hash style revisions.
func TestExecutor_BuildCommand(t *testing.T) {
	type buildCase struct {
		name     string
		flakeURL string
		hostname string
		action   messages.Action
		revision string
		want     string
	}

	cases := []buildCase{
		{
			name:     "switch action",
			flakeURL: "git+https://git.example.com/user/nixos-configs.git",
			hostname: "ns1",
			action:   messages.ActionSwitch,
			revision: "master",
			want:     "nixos-rebuild switch --flake git+https://git.example.com/user/nixos-configs.git?ref=master#ns1",
		},
		{
			name:     "boot action with commit hash",
			flakeURL: "git+https://git.example.com/user/nixos-configs.git",
			hostname: "web1",
			action:   messages.ActionBoot,
			revision: "abc123def456",
			want:     "nixos-rebuild boot --flake git+https://git.example.com/user/nixos-configs.git?ref=abc123def456#web1",
		},
		{
			name:     "test action with feature branch",
			flakeURL: "git+ssh://git@github.com/org/repo.git",
			hostname: "test-host",
			action:   messages.ActionTest,
			revision: "feature/new-feature",
			want:     "nixos-rebuild test --flake git+ssh://git@github.com/org/repo.git?ref=feature/new-feature#test-host",
		},
		{
			name:     "dry-activate action",
			flakeURL: "git+https://git.example.com/repo.git",
			hostname: "prod-1",
			action:   messages.ActionDryActivate,
			revision: "v1.0.0",
			want:     "nixos-rebuild dry-activate --flake git+https://git.example.com/repo.git?ref=v1.0.0#prod-1",
		},
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			executor := NewExecutor(tt.flakeURL, tt.hostname, 10*time.Minute)
			if got := executor.BuildCommand(tt.action, tt.revision); got != tt.want {
				t.Errorf("BuildCommand() = %q, want %q", got, tt.want)
			}
		})
	}
}
// TestNewExecutor checks that the constructor stores each argument in the
// matching field.
func TestNewExecutor(t *testing.T) {
	const (
		wantURL     = "git+https://example.com/repo.git"
		wantHost    = "host1"
		wantTimeout = 5 * time.Minute
	)

	got := NewExecutor(wantURL, wantHost, wantTimeout)

	if got.flakeURL != wantURL {
		t.Errorf("flakeURL = %q, want %q", got.flakeURL, wantURL)
	}
	if got.hostname != wantHost {
		t.Errorf("hostname = %q, want %q", got.hostname, wantHost)
	}
	if got.timeout != wantTimeout {
		t.Errorf("timeout = %v, want %v", got.timeout, wantTimeout)
	}
}

56
internal/deploy/lock.go Normal file
View File

@@ -0,0 +1,56 @@
// Package deploy provides deployment execution logic.
package deploy
import (
"sync"
)
// Lock is an in-memory, non-blocking lock that lets at most one deployment
// run at a time and remembers who currently holds it.
type Lock struct {
	// mu guards held and holder.
	mu sync.Mutex
	// held reports whether the lock is currently taken.
	held bool
	// holder identifies the current owner; it is empty while the lock is free.
	holder string
}

// NewLock returns a Lock that is not held by anyone.
func NewLock() *Lock {
	return new(Lock)
}

// TryAcquire takes the lock for holder if it is free and reports whether it
// did so. It never blocks waiting for the current holder.
func (l *Lock) TryAcquire(holder string) bool {
	l.mu.Lock()
	defer l.mu.Unlock()

	acquired := !l.held
	if acquired {
		l.held, l.holder = true, holder
	}
	return acquired
}

// Release frees the lock and clears the recorded holder. Releasing a lock
// that is not held is a no-op.
func (l *Lock) Release() {
	l.mu.Lock()
	l.held, l.holder = false, ""
	l.mu.Unlock()
}

// IsHeld reports whether the lock is currently held.
func (l *Lock) IsHeld() bool {
	l.mu.Lock()
	held := l.held
	l.mu.Unlock()
	return held
}

// Holder returns the identifier of the current holder, or the empty string
// when the lock is free.
func (l *Lock) Holder() string {
	l.mu.Lock()
	current := l.holder
	l.mu.Unlock()
	return current
}

View File

@@ -0,0 +1,98 @@
package deploy
import (
"sync"
"testing"
)
// TestLock_TryAcquire walks through acquire, contended acquire, release, and
// re-acquire, checking the recorded holder along the way.
func TestLock_TryAcquire(t *testing.T) {
	lock := NewLock()

	checkHolder := func(want string) {
		t.Helper()
		if got := lock.Holder(); got != want {
			t.Errorf("Holder() = %q, want %q", got, want)
		}
	}

	// The lock starts free, so the first caller wins.
	if ok := lock.TryAcquire("request-1"); !ok {
		t.Error("first TryAcquire should succeed")
	}

	// A second caller must be turned away without replacing the holder.
	if ok := lock.TryAcquire("request-2"); ok {
		t.Error("second TryAcquire should fail while lock is held")
	}
	checkHolder("request-1")

	// Once released, the lock can be taken by someone else.
	lock.Release()
	if ok := lock.TryAcquire("request-3"); !ok {
		t.Error("TryAcquire should succeed after Release")
	}
	checkHolder("request-3")
}
// TestLock_IsHeld checks that IsHeld tracks the lock through a full
// acquire/release cycle.
func TestLock_IsHeld(t *testing.T) {
	lock := NewLock()
	if held := lock.IsHeld(); held {
		t.Error("new lock should not be held")
	}

	lock.TryAcquire("test")
	if held := lock.IsHeld(); !held {
		t.Error("lock should be held after TryAcquire")
	}

	lock.Release()
	if held := lock.IsHeld(); held {
		t.Error("lock should not be held after Release")
	}
}
// TestLock_Concurrent races many goroutines for the same lock and checks that
// exactly one of them acquires it.
func TestLock_Concurrent(t *testing.T) {
	const contenders = 100

	lock := NewLock()
	// Buffered for every contender so a successful goroutine never blocks on send.
	winners := make(chan string, contenders)

	var wg sync.WaitGroup
	wg.Add(contenders)
	for i := range contenders {
		go func(id int) {
			defer wg.Done()
			name := string(rune('A' + id%26))
			if lock.TryAcquire(name) {
				winners <- name
			}
		}(i)
	}
	wg.Wait()
	close(winners)

	// Nobody releases the lock, so only a single goroutine may have won.
	if got := len(winners); got != 1 {
		t.Errorf("expected exactly 1 successful acquire, got %d", got)
	}
}
// TestLock_ReleaseUnheld checks that releasing a lock nobody holds neither
// panics nor leaves the lock marked as held.
func TestLock_ReleaseUnheld(t *testing.T) {
	lock := NewLock()

	// Must be a harmless no-op on a fresh lock.
	lock.Release()

	if held := lock.IsHeld(); held {
		t.Error("lock should not be held after Release on unheld lock")
	}
}