feat: implement NATS-based NixOS deployment system

Implement the complete homelab-deploy system with three operational modes:

- Listener mode: Runs on NixOS hosts as a systemd service, subscribes to NATS subjects with configurable templates, and executes nixos-rebuild on deployment requests with concurrency control
- MCP mode: MCP server exposing deploy, deploy_admin, and list_hosts tools for AI assistants with tiered access control
- CLI mode: Manual deployment commands with subject alias support via environment variables

Key components:

- internal/messages: Request/response types with validation
- internal/nats: Client wrapper with NKey authentication
- internal/deploy: Executor with timeout and lock for concurrency
- internal/listener: Subject template expansion and request handling
- internal/cli: Deploy logic with alias resolution
- internal/mcp: MCP server with mcp-go integration
- nixos/module.nix: NixOS module with hardened systemd service

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-07 04:19:47 +01:00
parent ad7d1a650c
commit fa49e9322a
27 changed files with 2929 additions and 26 deletions
--- a/internal/cli/aliases.go
+++ b/internal/cli/aliases.go
@@ -0,0 +1,40 @@
+// Package cli provides the deploy command logic.
+package cli
+
+import (
+	"os"
+	"strings"
+)
+
+// aliasEnvPrefix is the prefix of the environment variables that define
+// subject aliases; the normalized alias name is appended to it.
+const aliasEnvPrefix = "HOMELAB_DEPLOY_ALIAS_"
+
+// ResolveAlias resolves a subject alias to a full NATS subject.
+//
+// Input that contains a dot is treated as a subject and returned unchanged.
+// Any other input is looked up in the environment variable named by
+// GetAliasEnvVar (HOMELAB_DEPLOY_ALIAS_<NAME>, with the name upper-cased and
+// hyphens converted to underscores). If that variable is unset or empty, the
+// input is returned unchanged.
+func ResolveAlias(input string) string {
+	// Anything with a dot is already a subject.
+	if !IsAlias(input) {
+		return input
+	}
+
+	// Build the variable name with the shared helper so the normalization
+	// rules live in exactly one place.
+	if subject := os.Getenv(GetAliasEnvVar(input)); subject != "" {
+		return subject
+	}
+
+	// No alias defined: return the input as-is (will likely fail later).
+	return input
+}
+
+// IsAlias reports whether input should be treated as an alias rather than a
+// NATS subject, i.e. whether it contains no dot.
+func IsAlias(input string) bool {
+	return strings.IndexByte(input, '.') < 0
+}
+
+// GetAliasEnvVar builds the name of the environment variable that defines the
+// given alias: hyphens become underscores, the result is upper-cased, and the
+// alias prefix is prepended.
+func GetAliasEnvVar(alias string) string {
+	name := strings.ReplaceAll(alias, "-", "_")
+	return aliasEnvPrefix + strings.ToUpper(name)
+}
--- a/internal/cli/aliases_test.go
+++ b/internal/cli/aliases_test.go
@@ -0,0 +1,112 @@
+package cli
+
+import (
+	"testing"
+)
+
+// TestResolveAlias checks that subjects pass through untouched, that aliases
+// are matched regardless of case and hyphenation, and that an undefined alias
+// is returned as-is.
+func TestResolveAlias(t *testing.T) {
+	// Aliases the cases below rely on; t.Setenv restores the environment
+	// when the test finishes.
+	aliases := [][2]string{
+		{"HOMELAB_DEPLOY_ALIAS_TEST", "deploy.test.all"},
+		{"HOMELAB_DEPLOY_ALIAS_PROD", "deploy.prod.all"},
+		{"HOMELAB_DEPLOY_ALIAS_PROD_DNS", "deploy.prod.role.dns"},
+	}
+	for _, kv := range aliases {
+		t.Setenv(kv[0], kv[1])
+	}
+
+	cases := []struct {
+		label, in, expect string
+	}{
+		{"full subject unchanged", "deploy.prod.ns1", "deploy.prod.ns1"},
+		{"subject with multiple dots", "deploy.test.role.web", "deploy.test.role.web"},
+		{"lowercase alias", "test", "deploy.test.all"},
+		{"uppercase alias", "TEST", "deploy.test.all"},
+		{"mixed case alias", "TeSt", "deploy.test.all"},
+		{"alias with hyphen", "prod-dns", "deploy.prod.role.dns"},
+		{"alias with hyphen uppercase", "PROD-DNS", "deploy.prod.role.dns"},
+		{"unknown alias returns as-is", "unknown", "unknown"},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		t.Run(c.label, func(t *testing.T) {
+			if got := ResolveAlias(c.in); got != c.expect {
+				t.Errorf("ResolveAlias(%q) = %q, want %q", c.in, got, c.expect)
+			}
+		})
+	}
+}
+
+// TestIsAlias checks that dot-free inputs are classified as aliases and
+// dotted inputs as subjects.
+func TestIsAlias(t *testing.T) {
+	cases := []struct {
+		in     string
+		expect bool
+	}{
+		{in: "test", expect: true},
+		{in: "prod-dns", expect: true},
+		{in: "PROD", expect: true},
+		{in: "deploy.test.all", expect: false},
+		{in: "deploy.prod.ns1", expect: false},
+		{in: "a.b", expect: false},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		t.Run(c.in, func(t *testing.T) {
+			if got := IsAlias(c.in); got != c.expect {
+				t.Errorf("IsAlias(%q) = %v, want %v", c.in, got, c.expect)
+			}
+		})
+	}
+}
+
+// TestGetAliasEnvVar checks the alias-to-environment-variable name mapping,
+// including hyphen replacement and upper-casing.
+func TestGetAliasEnvVar(t *testing.T) {
+	cases := []struct {
+		in, expect string
+	}{
+		{in: "test", expect: "HOMELAB_DEPLOY_ALIAS_TEST"},
+		{in: "prod", expect: "HOMELAB_DEPLOY_ALIAS_PROD"},
+		{in: "prod-dns", expect: "HOMELAB_DEPLOY_ALIAS_PROD_DNS"},
+		{in: "my-long-alias", expect: "HOMELAB_DEPLOY_ALIAS_MY_LONG_ALIAS"},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		t.Run(c.in, func(t *testing.T) {
+			if got := GetAliasEnvVar(c.in); got != c.expect {
+				t.Errorf("GetAliasEnvVar(%q) = %q, want %q", c.in, got, c.expect)
+			}
+		})
+	}
+}
--- a/internal/cli/deploy.go
+++ b/internal/cli/deploy.go
@@ -0,0 +1,213 @@
+package cli
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/google/uuid"
+
+	"git.t-juice.club/torjus/homelab-deploy/internal/messages"
+	"git.t-juice.club/torjus/homelab-deploy/internal/nats"
+)
+
+// DeployConfig holds configuration for a deploy operation.
+// Deploy reads every field; none of them is modified.
+type DeployConfig struct {
+	NATSUrl  string          // URL handed to the NATS client when connecting
+	NKeyFile string          // NKey file handed to the NATS client when connecting
+	Subject  string          // subject the deploy request is published to
+	Action   messages.Action // copied into the request's Action field
+	Revision string          // copied into the request's Revision field
+	Timeout  time.Duration   // upper bound on how long Deploy waits for responses
+}
+
+// DeployResult contains the aggregated results from a deployment.
+// A host may contribute more than one entry to Responses.
+type DeployResult struct {
+	Responses []*messages.DeployResponse // every response that was decoded, in the order received
+	Errors    []error                    // one entry per response that could not be decoded
+}
+
+// AllSucceeded reports whether the deployment succeeded on every host.
+//
+// A host may contribute several responses (for example a progress update
+// followed by its final status), so each host is judged by the most recent
+// status recorded for its hostname rather than by every message it sent.
+// The result is true only when at least one response was received, no errors
+// were recorded, and every host's latest status is messages.StatusCompleted.
+func (r *DeployResult) AllSucceeded() bool {
+	if len(r.Responses) == 0 || len(r.Errors) > 0 {
+		return false
+	}
+
+	// Later entries overwrite earlier ones for the same host, so the map
+	// ends up holding whether each host's latest status is "completed".
+	completed := make(map[string]bool)
+	for _, resp := range r.Responses {
+		completed[resp.Hostname] = resp.Status == messages.StatusCompleted
+	}
+
+	for _, ok := range completed {
+		if !ok {
+			return false
+		}
+	}
+	return true
+}
+
+// HostCount reports how many distinct hostnames appear in the collected
+// responses; several responses from the same host count once.
+func (r *DeployResult) HostCount() int {
+	hosts := make(map[string]struct{}, len(r.Responses))
+	for i := range r.Responses {
+		hosts[r.Responses[i].Hostname] = struct{}{}
+	}
+	return len(hosts)
+}
+
+// Deploy publishes a deployment request to cfg.Subject and collects the
+// responses sent to a unique, per-request reply subject.
+//
+// Every response that can be decoded is recorded and, when onResponse is
+// non-nil, passed to it from the subscription callback. Responses that cannot
+// be decoded are recorded in the result's Errors instead.
+//
+// Deploy returns when one of the following happens:
+//   - ctx is cancelled: the responses collected so far are returned together
+//     with ctx.Err();
+//   - cfg.Timeout has elapsed since the request was sent;
+//   - every host that has responded has reported a final status and no
+//     further response has arrived for 30 seconds.
+//
+// A nil result and a non-nil error are returned when connecting, subscribing,
+// marshalling, publishing, or flushing fails. The returned result is a
+// snapshot: it is not modified after Deploy returns.
+func Deploy(ctx context.Context, cfg DeployConfig, onResponse func(*messages.DeployResponse)) (*DeployResult, error) {
+	// Connect to NATS
+	client, err := nats.Connect(nats.Config{
+		URL:      cfg.NATSUrl,
+		NKeyFile: cfg.NKeyFile,
+		Name:     "homelab-deploy-cli",
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to connect to NATS: %w", err)
+	}
+	defer client.Close()
+
+	// Generate a unique reply subject for this request.
+	requestID := uuid.New().String()
+	replySubject := fmt.Sprintf("deploy.responses.%s", requestID)
+
+	// State shared with the subscription callback; guarded by mu.
+	var (
+		mu           sync.Mutex
+		responses    []*messages.DeployResponse
+		errs         []error
+		lastResponse time.Time
+	)
+	// hostFinal maps every host that has responded to whether it has sent a
+	// final status yet.
+	hostFinal := make(map[string]bool)
+
+	// snapshot copies the collected state under the lock so the caller never
+	// shares memory with a callback that may still be running.
+	snapshot := func() *DeployResult {
+		mu.Lock()
+		defer mu.Unlock()
+		return &DeployResult{
+			Responses: append([]*messages.DeployResponse(nil), responses...),
+			Errors:    append([]error(nil), errs...),
+		}
+	}
+
+	// Subscribe to the reply subject before publishing so no response is missed.
+	sub, err := client.Subscribe(replySubject, func(subject string, data []byte) {
+		resp, err := messages.UnmarshalDeployResponse(data)
+		if err != nil {
+			mu.Lock()
+			errs = append(errs, fmt.Errorf("failed to unmarshal response: %w", err))
+			mu.Unlock()
+			return
+		}
+
+		mu.Lock()
+		responses = append(responses, resp)
+		lastResponse = time.Now()
+		// Once a host has reported a final status it stays final.
+		hostFinal[resp.Hostname] = hostFinal[resp.Hostname] || resp.Status.IsFinal()
+		mu.Unlock()
+
+		// Invoke the user callback outside the lock so it cannot block
+		// the bookkeeping above.
+		if onResponse != nil {
+			onResponse(resp)
+		}
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to subscribe to reply subject: %w", err)
+	}
+	defer func() { _ = sub.Unsubscribe() }() // best-effort cleanup; the connection is closed right after
+
+	// Build and send request
+	req := &messages.DeployRequest{
+		Action:   cfg.Action,
+		Revision: cfg.Revision,
+		ReplyTo:  replySubject,
+	}
+
+	data, err := req.Marshal()
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal request: %w", err)
+	}
+
+	if err := client.Publish(cfg.Subject, data); err != nil {
+		return nil, fmt.Errorf("failed to publish request: %w", err)
+	}
+
+	if err := client.Flush(); err != nil {
+		return nil, fmt.Errorf("failed to flush: %w", err)
+	}
+
+	// Wait for responses. cfg.Timeout is the absolute upper bound. The idle
+	// timeout lets Deploy return early, but only once every host that has
+	// responded is final: a host that has only sent a non-final status is
+	// still expected to report its outcome.
+	const idleTimeout = 30 * time.Second
+
+	deadline := time.NewTimer(cfg.Timeout)
+	defer deadline.Stop()
+
+	poll := time.NewTicker(time.Second)
+	defer poll.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return snapshot(), ctx.Err()
+		case <-deadline.C:
+			return snapshot(), nil
+		case <-poll.C:
+			mu.Lock()
+			settled := len(hostFinal) > 0
+			for _, final := range hostFinal {
+				if !final {
+					settled = false
+					break
+				}
+			}
+			idle := time.Since(lastResponse)
+			mu.Unlock()
+
+			if settled && idle > idleTimeout {
+				return snapshot(), nil
+			}
+		}
+	}
+}
+
+// Discover publishes a discovery request to discoverSubject and collects the
+// replies that arrive on a unique reply subject until timeout elapses.
+//
+// Replies that cannot be decoded are skipped. If ctx is cancelled before the
+// timeout, the replies collected so far are returned together with ctx.Err().
+// A nil slice and a non-nil error are returned when connecting, subscribing,
+// marshalling, publishing, or flushing fails. The returned slice is a
+// snapshot: it is not modified after Discover returns.
+func Discover(ctx context.Context, natsURL, nkeyFile, discoverSubject string, timeout time.Duration) ([]*messages.DiscoveryResponse, error) {
+	client, err := nats.Connect(nats.Config{
+		URL:      natsURL,
+		NKeyFile: nkeyFile,
+		Name:     "homelab-deploy-cli-discover",
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to connect to NATS: %w", err)
+	}
+	defer client.Close()
+
+	requestID := uuid.New().String()
+	replySubject := fmt.Sprintf("deploy.responses.discover-%s", requestID)
+
+	// responses is appended to by the subscription callback; guarded by mu.
+	var mu sync.Mutex
+	var responses []*messages.DiscoveryResponse
+
+	// collected copies the replies under the lock so the caller never shares
+	// the slice with a callback that may still be appending to it.
+	collected := func() []*messages.DiscoveryResponse {
+		mu.Lock()
+		defer mu.Unlock()
+		return append([]*messages.DiscoveryResponse(nil), responses...)
+	}
+
+	sub, err := client.Subscribe(replySubject, func(subject string, data []byte) {
+		resp, err := messages.UnmarshalDiscoveryResponse(data)
+		if err != nil {
+			// Discovery is best-effort: a malformed reply is ignored.
+			return
+		}
+		mu.Lock()
+		responses = append(responses, resp)
+		mu.Unlock()
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to subscribe: %w", err)
+	}
+	defer func() { _ = sub.Unsubscribe() }() // best-effort cleanup; the connection is closed right after
+
+	req := &messages.DiscoveryRequest{ReplyTo: replySubject}
+	data, err := req.Marshal()
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal request: %w", err)
+	}
+
+	if err := client.Publish(discoverSubject, data); err != nil {
+		return nil, fmt.Errorf("failed to publish: %w", err)
+	}
+
+	if err := client.Flush(); err != nil {
+		return nil, fmt.Errorf("failed to flush: %w", err)
+	}
+
+	// Collect replies until the timeout elapses or the caller gives up.
+	timer := time.NewTimer(timeout)
+	defer timer.Stop()
+
+	select {
+	case <-ctx.Done():
+		return collected(), ctx.Err()
+	case <-timer.C:
+		return collected(), nil
+	}
+}
--- a/internal/cli/deploy_test.go
+++ b/internal/cli/deploy_test.go
@@ -0,0 +1,109 @@
+package cli
+
+import (
+	"testing"
+
+	"git.t-juice.club/torjus/homelab-deploy/internal/messages"
+)
+
+// TestDeployResult_AllSucceeded covers the success case, failed and rejected
+// hosts, an empty result, and a result that carries errors.
+func TestDeployResult_AllSucceeded(t *testing.T) {
+	cases := []struct {
+		label  string
+		resps  []*messages.DeployResponse
+		errs   []error
+		expect bool
+	}{
+		{
+			label: "all completed",
+			resps: []*messages.DeployResponse{
+				{Hostname: "host1", Status: messages.StatusCompleted},
+				{Hostname: "host2", Status: messages.StatusCompleted},
+			},
+			expect: true,
+		},
+		{
+			label: "one failed",
+			resps: []*messages.DeployResponse{
+				{Hostname: "host1", Status: messages.StatusCompleted},
+				{Hostname: "host2", Status: messages.StatusFailed},
+			},
+			expect: false,
+		},
+		{
+			label: "one rejected",
+			resps: []*messages.DeployResponse{
+				{Hostname: "host1", Status: messages.StatusRejected},
+			},
+			expect: false,
+		},
+		{
+			label:  "no responses",
+			resps:  []*messages.DeployResponse{},
+			expect: false,
+		},
+		{
+			label: "has errors",
+			resps: []*messages.DeployResponse{
+				{Hostname: "host1", Status: messages.StatusCompleted},
+			},
+			// Only the slice length matters here, so a nil entry is enough.
+			errs:   []error{nil},
+			expect: false,
+		},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		t.Run(c.label, func(t *testing.T) {
+			result := &DeployResult{Responses: c.resps, Errors: c.errs}
+			if got := result.AllSucceeded(); got != c.expect {
+				t.Errorf("AllSucceeded() = %v, want %v", got, c.expect)
+			}
+		})
+	}
+}
+
+// TestDeployResult_HostCount checks that hosts are counted once each, however
+// many responses they sent.
+func TestDeployResult_HostCount(t *testing.T) {
+	cases := []struct {
+		label  string
+		resps  []*messages.DeployResponse
+		expect int
+	}{
+		{
+			label:  "no responses",
+			resps:  []*messages.DeployResponse{},
+			expect: 0,
+		},
+		{
+			label: "unique hosts",
+			resps: []*messages.DeployResponse{
+				{Hostname: "host1"},
+				{Hostname: "host2"},
+				{Hostname: "host3"},
+			},
+			expect: 3,
+		},
+		{
+			label: "duplicate hosts",
+			resps: []*messages.DeployResponse{
+				{Hostname: "host1", Status: messages.StatusStarted},
+				{Hostname: "host1", Status: messages.StatusCompleted},
+				{Hostname: "host2", Status: messages.StatusStarted},
+				{Hostname: "host2", Status: messages.StatusCompleted},
+			},
+			expect: 2,
+		},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		t.Run(c.label, func(t *testing.T) {
+			result := &DeployResult{Responses: c.resps}
+			if got := result.HostCount(); got != c.expect {
+				t.Errorf("HostCount() = %d, want %d", got, c.expect)
+			}
+		})
+	}
+}