feat: implement NATS-based NixOS deployment system
Implement the complete homelab-deploy system with three operational modes:

- Listener mode: Runs on NixOS hosts as a systemd service, subscribes to NATS subjects with configurable templates, and executes nixos-rebuild on deployment requests with concurrency control
- MCP mode: MCP server exposing deploy, deploy_admin, and list_hosts tools for AI assistants with tiered access control
- CLI mode: Manual deployment commands with subject alias support via environment variables

Key components:

- internal/messages: Request/response types with validation
- internal/nats: Client wrapper with NKey authentication
- internal/deploy: Executor with timeout and lock for concurrency
- internal/listener: Subject template expansion and request handling
- internal/cli: Deploy logic with alias resolution
- internal/mcp: MCP server with mcp-go integration
- nixos/module.nix: NixOS module with hardened systemd service

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
112
internal/deploy/executor.go
Normal file
112
internal/deploy/executor.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
)
|
||||
|
||||
// Executor runs nixos-rebuild for one host against one flake.
type Executor struct {
	// flakeURL is the flake location, e.g. git+https://git.example.com/user/repo.git.
	flakeURL string
	// hostname is appended after '#' when the flake reference is built.
	hostname string
	// timeout bounds a single Execute call.
	timeout time.Duration
}

// NewExecutor returns an Executor that deploys hostname from flakeURL and
// allows each deployment at most timeout to complete.
func NewExecutor(flakeURL, hostname string, timeout time.Duration) *Executor {
	e := new(Executor)
	e.flakeURL = flakeURL
	e.hostname = hostname
	e.timeout = timeout
	return e
}
|
||||
|
||||
// Result describes the outcome of one nixos-rebuild invocation.
type Result struct {
	// Success is true when the command finished without error.
	Success bool
	// ExitCode is the command's exit status, or -1 when the failure did not
	// come with an exit status.
	ExitCode int
	// Stdout and Stderr hold the captured output streams of the command.
	Stdout, Stderr string
	// Error is nil on success and describes the failure otherwise.
	Error error
}
|
||||
|
||||
// ValidateRevision checks if a revision exists in the remote repository.
|
||||
// It uses git ls-remote to verify the ref exists.
|
||||
func (e *Executor) ValidateRevision(ctx context.Context, revision string) error {
|
||||
// Extract the base URL for git ls-remote
|
||||
// flakeURL is like git+https://git.example.com/user/repo.git
|
||||
// We need to strip the git+ prefix for git ls-remote
|
||||
gitURL := e.flakeURL
|
||||
if len(gitURL) > 4 && gitURL[:4] == "git+" {
|
||||
gitURL = gitURL[4:]
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "git", "ls-remote", "--exit-code", gitURL, revision)
|
||||
var stderr bytes.Buffer
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
return fmt.Errorf("timeout validating revision")
|
||||
}
|
||||
return fmt.Errorf("revision %q not found: %w", revision, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Execute runs nixos-rebuild with the specified action and revision.
|
||||
func (e *Executor) Execute(ctx context.Context, action messages.Action, revision string) *Result {
|
||||
ctx, cancel := context.WithTimeout(ctx, e.timeout)
|
||||
defer cancel()
|
||||
|
||||
// Build the flake reference: <flake-url>?ref=<revision>#<hostname>
|
||||
flakeRef := fmt.Sprintf("%s?ref=%s#%s", e.flakeURL, revision, e.hostname)
|
||||
|
||||
cmd := exec.CommandContext(ctx, "nixos-rebuild", string(action), "--flake", flakeRef)
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
err := cmd.Run()
|
||||
|
||||
result := &Result{
|
||||
Stdout: stdout.String(),
|
||||
Stderr: stderr.String(),
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
result.Success = false
|
||||
result.Error = err
|
||||
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
result.Error = fmt.Errorf("deployment timed out after %v", e.timeout)
|
||||
}
|
||||
|
||||
if exitErr, ok := err.(*exec.ExitError); ok {
|
||||
result.ExitCode = exitErr.ExitCode()
|
||||
} else {
|
||||
result.ExitCode = -1
|
||||
}
|
||||
} else {
|
||||
result.Success = true
|
||||
result.ExitCode = 0
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// BuildCommand returns the command that would be executed (for logging/debugging).
|
||||
func (e *Executor) BuildCommand(action messages.Action, revision string) string {
|
||||
flakeRef := fmt.Sprintf("%s?ref=%s#%s", e.flakeURL, revision, e.hostname)
|
||||
return fmt.Sprintf("nixos-rebuild %s --flake %s", action, flakeRef)
|
||||
}
|
||||
76
internal/deploy/executor_test.go
Normal file
76
internal/deploy/executor_test.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
)
|
||||
|
||||
func TestExecutor_BuildCommand(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
flakeURL string
|
||||
hostname string
|
||||
action messages.Action
|
||||
revision string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "switch action",
|
||||
flakeURL: "git+https://git.example.com/user/nixos-configs.git",
|
||||
hostname: "ns1",
|
||||
action: messages.ActionSwitch,
|
||||
revision: "master",
|
||||
want: "nixos-rebuild switch --flake git+https://git.example.com/user/nixos-configs.git?ref=master#ns1",
|
||||
},
|
||||
{
|
||||
name: "boot action with commit hash",
|
||||
flakeURL: "git+https://git.example.com/user/nixos-configs.git",
|
||||
hostname: "web1",
|
||||
action: messages.ActionBoot,
|
||||
revision: "abc123def456",
|
||||
want: "nixos-rebuild boot --flake git+https://git.example.com/user/nixos-configs.git?ref=abc123def456#web1",
|
||||
},
|
||||
{
|
||||
name: "test action with feature branch",
|
||||
flakeURL: "git+ssh://git@github.com/org/repo.git",
|
||||
hostname: "test-host",
|
||||
action: messages.ActionTest,
|
||||
revision: "feature/new-feature",
|
||||
want: "nixos-rebuild test --flake git+ssh://git@github.com/org/repo.git?ref=feature/new-feature#test-host",
|
||||
},
|
||||
{
|
||||
name: "dry-activate action",
|
||||
flakeURL: "git+https://git.example.com/repo.git",
|
||||
hostname: "prod-1",
|
||||
action: messages.ActionDryActivate,
|
||||
revision: "v1.0.0",
|
||||
want: "nixos-rebuild dry-activate --flake git+https://git.example.com/repo.git?ref=v1.0.0#prod-1",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
e := NewExecutor(tc.flakeURL, tc.hostname, 10*time.Minute)
|
||||
got := e.BuildCommand(tc.action, tc.revision)
|
||||
if got != tc.want {
|
||||
t.Errorf("BuildCommand() = %q, want %q", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewExecutor(t *testing.T) {
|
||||
e := NewExecutor("git+https://example.com/repo.git", "host1", 5*time.Minute)
|
||||
|
||||
if e.flakeURL != "git+https://example.com/repo.git" {
|
||||
t.Errorf("flakeURL = %q, want %q", e.flakeURL, "git+https://example.com/repo.git")
|
||||
}
|
||||
if e.hostname != "host1" {
|
||||
t.Errorf("hostname = %q, want %q", e.hostname, "host1")
|
||||
}
|
||||
if e.timeout != 5*time.Minute {
|
||||
t.Errorf("timeout = %v, want %v", e.timeout, 5*time.Minute)
|
||||
}
|
||||
}
|
||||
56
internal/deploy/lock.go
Normal file
56
internal/deploy/lock.go
Normal file
@@ -0,0 +1,56 @@
|
||||
// Package deploy provides deployment execution logic.
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Lock is an in-memory, non-blocking lock used to allow only one deployment
// at a time. Acquisition never waits: it either succeeds immediately or
// reports that the lock is already taken.
type Lock struct {
	// mu guards held and holder.
	mu sync.Mutex
	// held is true while the lock is taken.
	held bool
	// holder is the identifier passed to the successful TryAcquire call, or
	// the empty string while the lock is free.
	holder string
}

// NewLock returns a Lock that is not held.
func NewLock() *Lock {
	return new(Lock)
}

// TryAcquire takes the lock on behalf of holder if it is currently free and
// reports whether it did so. It returns false, leaving the current holder
// unchanged, when the lock is already held.
func (lk *Lock) TryAcquire(holder string) bool {
	lk.mu.Lock()
	defer lk.mu.Unlock()

	if !lk.held {
		lk.held, lk.holder = true, holder
		return true
	}
	return false
}

// Release frees the lock and clears the recorded holder. It does not check
// who is calling, and releasing a lock that is not held is a no-op.
func (lk *Lock) Release() {
	lk.mu.Lock()
	lk.held, lk.holder = false, ""
	lk.mu.Unlock()
}

// IsHeld reports whether the lock is currently held.
func (lk *Lock) IsHeld() bool {
	lk.mu.Lock()
	held := lk.held
	lk.mu.Unlock()
	return held
}

// Holder returns the identifier of the current holder, or the empty string
// when the lock is not held.
func (lk *Lock) Holder() string {
	lk.mu.Lock()
	name := lk.holder
	lk.mu.Unlock()
	return name
}
|
||||
98
internal/deploy/lock_test.go
Normal file
98
internal/deploy/lock_test.go
Normal file
@@ -0,0 +1,98 @@
|
||||
package deploy
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLock_TryAcquire(t *testing.T) {
|
||||
l := NewLock()
|
||||
|
||||
// First acquire should succeed
|
||||
if !l.TryAcquire("request-1") {
|
||||
t.Error("first TryAcquire should succeed")
|
||||
}
|
||||
|
||||
// Second acquire should fail
|
||||
if l.TryAcquire("request-2") {
|
||||
t.Error("second TryAcquire should fail while lock is held")
|
||||
}
|
||||
|
||||
// Verify holder
|
||||
if got := l.Holder(); got != "request-1" {
|
||||
t.Errorf("Holder() = %q, want %q", got, "request-1")
|
||||
}
|
||||
|
||||
// Release and try again
|
||||
l.Release()
|
||||
|
||||
if !l.TryAcquire("request-3") {
|
||||
t.Error("TryAcquire should succeed after Release")
|
||||
}
|
||||
|
||||
if got := l.Holder(); got != "request-3" {
|
||||
t.Errorf("Holder() = %q, want %q", got, "request-3")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLock_IsHeld(t *testing.T) {
|
||||
l := NewLock()
|
||||
|
||||
if l.IsHeld() {
|
||||
t.Error("new lock should not be held")
|
||||
}
|
||||
|
||||
l.TryAcquire("test")
|
||||
|
||||
if !l.IsHeld() {
|
||||
t.Error("lock should be held after TryAcquire")
|
||||
}
|
||||
|
||||
l.Release()
|
||||
|
||||
if l.IsHeld() {
|
||||
t.Error("lock should not be held after Release")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLock_Concurrent(t *testing.T) {
|
||||
l := NewLock()
|
||||
var wg sync.WaitGroup
|
||||
acquired := make(chan string, 100)
|
||||
|
||||
// Try to acquire from multiple goroutines
|
||||
for i := range 100 {
|
||||
wg.Add(1)
|
||||
go func(id int) {
|
||||
defer wg.Done()
|
||||
holder := string(rune('A' + (id % 26)))
|
||||
if l.TryAcquire(holder) {
|
||||
acquired <- holder
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
close(acquired)
|
||||
|
||||
// Only one should have succeeded
|
||||
count := 0
|
||||
for range acquired {
|
||||
count++
|
||||
}
|
||||
|
||||
if count != 1 {
|
||||
t.Errorf("expected exactly 1 successful acquire, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLock_ReleaseUnheld(t *testing.T) {
|
||||
l := NewLock()
|
||||
|
||||
// Releasing an unheld lock should not panic
|
||||
l.Release()
|
||||
|
||||
if l.IsHeld() {
|
||||
t.Error("lock should not be held after Release on unheld lock")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user