feat: implement NATS-based NixOS deployment system

Implement the complete homelab-deploy system with three operational modes:

- Listener mode: Runs on NixOS hosts as a systemd service, subscribes to NATS subjects with configurable templates, executes nixos-rebuild on deployment requests with concurrency control
- MCP mode: MCP server exposing deploy, deploy_admin, and list_hosts tools for AI assistants with tiered access control
- CLI mode: Manual deployment commands with subject alias support via environment variables

Key components:

- internal/messages: Request/response types with validation
- internal/nats: Client wrapper with NKey authentication
- internal/deploy: Executor with timeout and lock for concurrency
- internal/listener: Subject template expansion and request handling
- internal/cli: Deploy logic with alias resolution
- internal/mcp: MCP server with mcp-go integration
- nixos/module.nix: NixOS module with hardened systemd service

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-07 04:19:47 +01:00
parent ad7d1a650c
commit fa49e9322a
27 changed files with 2929 additions and 26 deletions
--- a/internal/deploy/executor.go
+++ b/internal/deploy/executor.go
@@ -0,0 +1,112 @@
+package deploy
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os/exec"
+	"time"
+
+	"git.t-juice.club/torjus/homelab-deploy/internal/messages"
+)
+
+// Executor runs nixos-rebuild for one host against one flake.
+type Executor struct {
+	// flakeURL is the flake location, e.g.
+	// git+https://git.example.com/user/repo.git.
+	flakeURL string
+
+	// hostname is the flake output name placed after "#" in the flake
+	// reference.
+	hostname string
+
+	// timeout bounds a single Execute call.
+	timeout time.Duration
+}
+
+// NewExecutor returns an Executor that deploys hostname from the flake at
+// flakeURL, allowing each Execute call to run for at most timeout.
+func NewExecutor(flakeURL, hostname string, timeout time.Duration) *Executor {
+	e := new(Executor)
+	e.flakeURL = flakeURL
+	e.hostname = hostname
+	e.timeout = timeout
+	return e
+}
+
+// Result describes the outcome of one Execute call.
+type Result struct {
+	// Success is true only when the command ran and exited without error.
+	Success bool
+
+	// ExitCode is the process exit code: 0 on success, or -1 when the
+	// failure was not a process exit error.
+	ExitCode int
+
+	// Stdout is everything the command wrote to standard output.
+	Stdout string
+
+	// Stderr is everything the command wrote to standard error.
+	Stderr string
+
+	// Error is nil on success; otherwise it is the run error or a
+	// timeout error.
+	Error error
+}
+
+// ValidateRevision checks that revision names a ref in the remote repository
+// behind the executor's flake URL.
+//
+// It runs "git ls-remote --exit-code <url> <revision>" under a 30 second
+// timeout layered on top of ctx. A leading "git+" on the flake URL is removed
+// first, since git ls-remote needs the plain URL.
+//
+// It returns nil when git exits successfully. It returns an error when
+// revision is empty or starts with "-" (rejected before git is started), when
+// the context deadline expires, or when git fails; in the last case the error
+// wraps the underlying exec error and includes whatever git wrote to stderr.
+func (e *Executor) ValidateRevision(ctx context.Context, revision string) error {
+	// A ref name is never empty, and a leading "-" could be mistaken for a
+	// command-line option, so refuse both without contacting the remote.
+	if revision == "" || revision[0] == '-' {
+		return fmt.Errorf("invalid revision %q", revision)
+	}
+
+	// flakeURL is like git+https://git.example.com/user/repo.git; drop the
+	// git+ prefix to get a URL git ls-remote accepts.
+	gitURL := e.flakeURL
+	if len(gitURL) > 4 && gitURL[:4] == "git+" {
+		gitURL = gitURL[4:]
+	}
+
+	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "git", "ls-remote", "--exit-code", gitURL, revision)
+	var stderr bytes.Buffer
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		if ctx.Err() == context.DeadlineExceeded {
+			return fmt.Errorf("timeout validating revision")
+		}
+		// Surface git's own diagnostics (authentication, network, ...) so
+		// the caller can tell them apart from a missing ref.
+		if detail := bytes.TrimSpace(stderr.Bytes()); len(detail) > 0 {
+			return fmt.Errorf("revision %q not found: %w: %s", revision, err, detail)
+		}
+		return fmt.Errorf("revision %q not found: %w", revision, err)
+	}
+
+	return nil
+}
+
+// Execute runs "nixos-rebuild <action> --flake <flake-url>?ref=<revision>#<hostname>"
+// and reports the outcome.
+//
+// The command is bounded by the executor's timeout on top of ctx. The
+// returned Result always carries the captured stdout and stderr. On failure
+// ExitCode is the process exit code, or -1 when the error is not an
+// *exec.ExitError, and Error is the run error, replaced by a timeout error if
+// the context deadline was exceeded.
+func (e *Executor) Execute(ctx context.Context, action messages.Action, revision string) *Result {
+	runCtx, stop := context.WithTimeout(ctx, e.timeout)
+	defer stop()
+
+	// Flake reference: <flake-url>?ref=<revision>#<hostname>
+	target := fmt.Sprintf("%s?ref=%s#%s", e.flakeURL, revision, e.hostname)
+
+	var outBuf, errBuf bytes.Buffer
+	rebuild := exec.CommandContext(runCtx, "nixos-rebuild", string(action), "--flake", target)
+	rebuild.Stdout = &outBuf
+	rebuild.Stderr = &errBuf
+
+	runErr := rebuild.Run()
+
+	res := &Result{
+		Success: runErr == nil,
+		Stdout:  outBuf.String(),
+		Stderr:  errBuf.String(),
+	}
+	if runErr == nil {
+		return res
+	}
+
+	// Default to -1 and refine when the process actually exited.
+	res.ExitCode = -1
+	if exitErr, ok := runErr.(*exec.ExitError); ok {
+		res.ExitCode = exitErr.ExitCode()
+	}
+
+	res.Error = runErr
+	if runCtx.Err() == context.DeadlineExceeded {
+		res.Error = fmt.Errorf("deployment timed out after %v", e.timeout)
+	}
+
+	return res
+}
+
+// BuildCommand renders, for logging and debugging, the nixos-rebuild command
+// line for the given action and revision. Nothing is executed.
+func (e *Executor) BuildCommand(action messages.Action, revision string) string {
+	return fmt.Sprintf(
+		"nixos-rebuild %s --flake %s?ref=%s#%s",
+		action, e.flakeURL, revision, e.hostname,
+	)
+}
--- a/internal/deploy/executor_test.go
+++ b/internal/deploy/executor_test.go
@@ -0,0 +1,76 @@
+package deploy
+
+import (
+	"testing"
+	"time"
+
+	"git.t-juice.club/torjus/homelab-deploy/internal/messages"
+)
+
+func TestExecutor_BuildCommand(t *testing.T) {
+	tests := []struct {
+		name     string
+		flakeURL string
+		hostname string
+		action   messages.Action
+		revision string
+		want     string
+	}{
+		{
+			name:     "switch action",
+			flakeURL: "git+https://git.example.com/user/nixos-configs.git",
+			hostname: "ns1",
+			action:   messages.ActionSwitch,
+			revision: "master",
+			want:     "nixos-rebuild switch --flake git+https://git.example.com/user/nixos-configs.git?ref=master#ns1",
+		},
+		{
+			name:     "boot action with commit hash",
+			flakeURL: "git+https://git.example.com/user/nixos-configs.git",
+			hostname: "web1",
+			action:   messages.ActionBoot,
+			revision: "abc123def456",
+			want:     "nixos-rebuild boot --flake git+https://git.example.com/user/nixos-configs.git?ref=abc123def456#web1",
+		},
+		{
+			name:     "test action with feature branch",
+			flakeURL: "git+ssh://git@github.com/org/repo.git",
+			hostname: "test-host",
+			action:   messages.ActionTest,
+			revision: "feature/new-feature",
+			want:     "nixos-rebuild test --flake git+ssh://git@github.com/org/repo.git?ref=feature/new-feature#test-host",
+		},
+		{
+			name:     "dry-activate action",
+			flakeURL: "git+https://git.example.com/repo.git",
+			hostname: "prod-1",
+			action:   messages.ActionDryActivate,
+			revision: "v1.0.0",
+			want:     "nixos-rebuild dry-activate --flake git+https://git.example.com/repo.git?ref=v1.0.0#prod-1",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			e := NewExecutor(tc.flakeURL, tc.hostname, 10*time.Minute)
+			got := e.BuildCommand(tc.action, tc.revision)
+			if got != tc.want {
+				t.Errorf("BuildCommand() = %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
+
+func TestNewExecutor(t *testing.T) {
+	e := NewExecutor("git+https://example.com/repo.git", "host1", 5*time.Minute)
+
+	if e.flakeURL != "git+https://example.com/repo.git" {
+		t.Errorf("flakeURL = %q, want %q", e.flakeURL, "git+https://example.com/repo.git")
+	}
+	if e.hostname != "host1" {
+		t.Errorf("hostname = %q, want %q", e.hostname, "host1")
+	}
+	if e.timeout != 5*time.Minute {
+		t.Errorf("timeout = %v, want %v", e.timeout, 5*time.Minute)
+	}
+}
--- a/internal/deploy/lock.go
+++ b/internal/deploy/lock.go
@@ -0,0 +1,56 @@
+// Package deploy provides deployment execution logic.
+package deploy
+
+import (
+	"sync"
+)
+
+// Lock is an in-memory, non-blocking lock used to allow only one deployment
+// at a time. All methods serialize on an internal mutex.
+type Lock struct {
+	// mu guards held and holder.
+	mu sync.Mutex
+
+	// held reports whether the lock is currently taken.
+	held bool
+
+	// holder is the identifier passed to the successful TryAcquire, or ""
+	// when the lock is free.
+	holder string
+}
+
+// NewLock returns a Lock that is not held.
+func NewLock() *Lock {
+	return new(Lock)
+}
+
+// TryAcquire takes the lock for holder if it is free and reports whether it
+// did so. It never blocks waiting for the lock; when the lock is already
+// held it returns false and leaves the current holder in place.
+func (l *Lock) TryAcquire(holder string) bool {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	acquired := !l.held
+	if acquired {
+		l.held, l.holder = true, holder
+	}
+	return acquired
+}
+
+// Release frees the lock and clears the recorded holder. Releasing a lock
+// that is not held is a no-op.
+func (l *Lock) Release() {
+	l.mu.Lock()
+	l.held, l.holder = false, ""
+	l.mu.Unlock()
+}
+
+// IsHeld reports whether the lock is currently held.
+func (l *Lock) IsHeld() bool {
+	l.mu.Lock()
+	held := l.held
+	l.mu.Unlock()
+	return held
+}
+
+// Holder returns the identifier of the current holder, or the empty string
+// when the lock is free.
+func (l *Lock) Holder() string {
+	l.mu.Lock()
+	name := l.holder
+	l.mu.Unlock()
+	return name
+}
--- a/internal/deploy/lock_test.go
+++ b/internal/deploy/lock_test.go
@@ -0,0 +1,98 @@
+package deploy
+
+import (
+	"sync"
+	"testing"
+)
+
+// TestLock_TryAcquire walks one lock through acquire, contended acquire,
+// release and re-acquire, checking the recorded holder along the way.
+func TestLock_TryAcquire(t *testing.T) {
+	lock := NewLock()
+
+	// A free lock can be taken.
+	if ok := lock.TryAcquire("request-1"); !ok {
+		t.Error("first TryAcquire should succeed")
+	}
+
+	// A held lock rejects other callers.
+	if ok := lock.TryAcquire("request-2"); ok {
+		t.Error("second TryAcquire should fail while lock is held")
+	}
+
+	// The failed attempt must not overwrite the holder.
+	if holder := lock.Holder(); holder != "request-1" {
+		t.Errorf("Holder() = %q, want %q", holder, "request-1")
+	}
+
+	// Once released, the lock can be taken by someone else.
+	lock.Release()
+
+	if ok := lock.TryAcquire("request-3"); !ok {
+		t.Error("TryAcquire should succeed after Release")
+	}
+
+	if holder := lock.Holder(); holder != "request-3" {
+		t.Errorf("Holder() = %q, want %q", holder, "request-3")
+	}
+}
+
+// TestLock_IsHeld checks that IsHeld follows the lock through its
+// free -> held -> free transitions.
+func TestLock_IsHeld(t *testing.T) {
+	lock := NewLock()
+
+	if held := lock.IsHeld(); held {
+		t.Error("new lock should not be held")
+	}
+
+	lock.TryAcquire("test")
+
+	if held := lock.IsHeld(); !held {
+		t.Error("lock should be held after TryAcquire")
+	}
+
+	lock.Release()
+
+	if held := lock.IsHeld(); held {
+		t.Error("lock should not be held after Release")
+	}
+}
+
+// TestLock_Concurrent races 100 goroutines for the same lock and checks that
+// exactly one of them acquires it.
+func TestLock_Concurrent(t *testing.T) {
+	const workers = 100
+
+	lock := NewLock()
+	// Buffered for every worker so a winner never blocks on send.
+	winners := make(chan string, workers)
+
+	var wg sync.WaitGroup
+	wg.Add(workers)
+	for i := range workers {
+		go func(id int) {
+			defer wg.Done()
+			name := string(rune('A' + (id % 26)))
+			if lock.TryAcquire(name) {
+				winners <- name
+			}
+		}(i)
+	}
+	wg.Wait()
+
+	// All senders have finished, so the buffered length is the number of
+	// successful acquires.
+	if got := len(winners); got != 1 {
+		t.Errorf("expected exactly 1 successful acquire, got %d", got)
+	}
+}
+
+// TestLock_ReleaseUnheld checks that Release on a lock nobody holds neither
+// panics nor leaves the lock marked as held.
+func TestLock_ReleaseUnheld(t *testing.T) {
+	lock := NewLock()
+
+	// Must be a harmless no-op.
+	lock.Release()
+
+	if held := lock.IsHeld(); held {
+		t.Error("lock should not be held after Release on unheld lock")
+	}
+}