This repository has been archived on 2026-03-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
homelab-deploy/cmd/homelab-deploy/main.go
Torjus Håkestad 746e30b24f fix: initialize counter and histogram metrics at startup
Counter and histogram metrics were absent from Prometheus scrapes until
the first deployment occurred, making it impossible to distinguish
"no deployments" from "exporter not running" in dashboards and alerts.

Initialize all expected label combinations with zero values when the
collector is created so metrics appear in every scrape from startup.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-08 21:29:36 +01:00

377 lines
9.8 KiB
Go

package main
import (
"context"
"fmt"
"log/slog"
"os"
"os/signal"
"syscall"
"time"
deploycli "git.t-juice.club/torjus/homelab-deploy/internal/cli"
"git.t-juice.club/torjus/homelab-deploy/internal/listener"
"git.t-juice.club/torjus/homelab-deploy/internal/mcp"
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"github.com/urfave/cli/v3"
)
// version is the application version string. It is reported through the root
// command's Version field and passed to the listener configuration.
const version = "0.1.11"
// main assembles the homelab-deploy root command from its subcommands and runs
// it against the process arguments. If the command returns an error it is
// written to stderr and the process exits with status 1.
func main() {
	subcommands := []*cli.Command{
		listenerCommand(),
		mcpCommand(),
		deployCommand(),
		listHostsCommand(),
	}

	root := &cli.Command{
		Name:     "homelab-deploy",
		Usage:    "Message-based NixOS deployment system using NATS",
		Version:  version,
		Commands: subcommands,
	}

	err := root.Run(context.Background(), os.Args)
	if err == nil {
		return
	}

	fmt.Fprintf(os.Stderr, "error: %v\n", err)
	os.Exit(1)
}
// listenerCommand builds the "listener" subcommand.
//
// The action validates the tier and the numeric flags, copies the flag values
// into a listener.Config, creates a JSON slog logger writing to stdout at info
// level, and calls Run on the listener with a context that is cancelled when
// the process receives SIGINT or SIGTERM.
//
// It returns an error when the tier is neither "test" nor "prod", when
// --timeout or --heartbeat-interval is negative, or when the listener's Run
// returns an error.
func listenerCommand() *cli.Command {
	return &cli.Command{
		Name:  "listener",
		Usage: "Run as a deployment listener (systemd service mode)",
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "hostname",
				Usage:    "Hostname for this listener",
				Required: true,
			},
			&cli.StringFlag{
				Name:     "tier",
				Usage:    "Deployment tier (test or prod)",
				Required: true,
			},
			&cli.StringFlag{
				Name:  "role",
				Usage: "Role for role-based deployment targeting",
			},
			&cli.StringFlag{
				Name:     "nats-url",
				Usage:    "NATS server URL",
				Required: true,
			},
			&cli.StringFlag{
				Name:     "nkey-file",
				Usage:    "Path to NKey seed file for NATS authentication",
				Required: true,
			},
			&cli.StringFlag{
				Name:     "flake-url",
				Usage:    "Git flake URL for nixos-rebuild",
				Required: true,
			},
			&cli.IntFlag{
				Name:  "timeout",
				Usage: "Deployment timeout in seconds",
				Value: 600,
			},
			&cli.StringSliceFlag{
				Name:  "deploy-subject",
				Usage: "NATS subject to subscribe to for deployments (can be repeated)",
				// NOTE(review): the <tier>/<hostname>/<role> placeholders are
				// passed through verbatim here; presumably the listener package
				// expands them — confirm against internal/listener.
				Value: []string{
					"deploy.<tier>.<hostname>",
					"deploy.<tier>.all",
					"deploy.<tier>.role.<role>",
				},
			},
			&cli.StringFlag{
				Name:  "discover-subject",
				Usage: "NATS subject for host discovery requests",
				Value: "deploy.discover",
			},
			&cli.BoolFlag{
				Name:  "metrics-enabled",
				Usage: "Enable Prometheus metrics endpoint",
			},
			&cli.StringFlag{
				Name:  "metrics-addr",
				Usage: "Address for Prometheus metrics HTTP server",
				Value: ":9972",
			},
			&cli.IntFlag{
				Name:  "heartbeat-interval",
				Usage: "Interval in seconds for sending status updates during deployment (0 to disable)",
				Value: 15,
			},
		},
		Action: func(ctx context.Context, c *cli.Command) error {
			tier := c.String("tier")
			if tier != "test" && tier != "prod" {
				return fmt.Errorf("tier must be 'test' or 'prod', got %q", tier)
			}

			// Reject negative values up front: converting them would hand the
			// listener negative durations. Zero is passed through unchanged
			// (for heartbeat-interval the flag usage documents it as "disable").
			timeout := c.Int("timeout")
			if timeout < 0 {
				return fmt.Errorf("timeout must not be negative, got %d", timeout)
			}
			heartbeat := c.Int("heartbeat-interval")
			if heartbeat < 0 {
				return fmt.Errorf("heartbeat-interval must not be negative, got %d", heartbeat)
			}

			cfg := listener.Config{
				Hostname:          c.String("hostname"),
				Tier:              tier,
				Role:              c.String("role"),
				NATSUrl:           c.String("nats-url"),
				NKeyFile:          c.String("nkey-file"),
				FlakeURL:          c.String("flake-url"),
				Timeout:           time.Duration(timeout) * time.Second,
				HeartbeatInterval: time.Duration(heartbeat) * time.Second,
				DeploySubjects:    c.StringSlice("deploy-subject"),
				DiscoverSubject:   c.String("discover-subject"),
				MetricsEnabled:    c.Bool("metrics-enabled"),
				MetricsAddr:       c.String("metrics-addr"),
				Version:           version,
			}

			logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
				Level: slog.LevelInfo,
			}))

			l := listener.New(cfg, logger)

			// Cancel the context on SIGINT/SIGTERM so Run can observe shutdown.
			ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
			defer cancel()

			return l.Run(ctx)
		},
	}
}
// mcpCommand builds the "mcp" subcommand.
//
// The action checks that an admin NKey file is supplied whenever
// --enable-admin is set, validates the timeout, copies the flag values into an
// mcp.ServerConfig and calls Run on the resulting server. The action's context
// argument is not used; Run is invoked without it.
//
// It returns an error when --enable-admin is set without --admin-nkey-file,
// when --timeout is negative, or when the server's Run returns an error.
func mcpCommand() *cli.Command {
	return &cli.Command{
		Name:  "mcp",
		Usage: "Run as an MCP server for AI assistants",
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "nats-url",
				Usage:    "NATS server URL",
				Required: true,
			},
			&cli.StringFlag{
				Name:     "nkey-file",
				Usage:    "Path to NKey seed file for NATS authentication",
				Required: true,
			},
			&cli.BoolFlag{
				Name:  "enable-admin",
				Usage: "Enable admin deployment tool for all tiers",
			},
			&cli.StringFlag{
				Name:  "admin-nkey-file",
				Usage: "Path to admin NKey seed file (required if --enable-admin)",
			},
			&cli.StringFlag{
				Name:  "discover-subject",
				Usage: "NATS subject for host discovery",
				Value: "deploy.discover",
			},
			&cli.IntFlag{
				Name:  "timeout",
				Usage: "Timeout in seconds for deployment operations",
				Value: 900,
			},
		},
		Action: func(_ context.Context, c *cli.Command) error {
			enableAdmin := c.Bool("enable-admin")
			adminNKeyFile := c.String("admin-nkey-file")

			if enableAdmin && adminNKeyFile == "" {
				return fmt.Errorf("--admin-nkey-file is required when --enable-admin is set")
			}

			// A negative value would otherwise become a negative duration in
			// the server configuration.
			timeout := c.Int("timeout")
			if timeout < 0 {
				return fmt.Errorf("timeout must not be negative, got %d", timeout)
			}

			cfg := mcp.ServerConfig{
				NATSUrl:         c.String("nats-url"),
				NKeyFile:        c.String("nkey-file"),
				EnableAdmin:     enableAdmin,
				AdminNKeyFile:   adminNKeyFile,
				DiscoverSubject: c.String("discover-subject"),
				Timeout:         time.Duration(timeout) * time.Second,
			}

			s := mcp.New(cfg)
			return s.Run()
		},
	}
}
// deployCommand builds the "deploy" subcommand.
//
// The action takes exactly one positional argument, the target subject, and
// passes it through deploycli.ResolveAlias. It validates the action and the
// timeout, then calls deploycli.Deploy with a context that is cancelled on
// SIGINT or SIGTERM. Each response delivered to the callback is printed as it
// arrives, followed by a summary line once Deploy returns.
//
// It returns an error when the subject is missing, when more than one
// positional argument is given, when the action is invalid, when --timeout is
// negative, when Deploy fails, or when the result reports that not all
// deployments succeeded.
func deployCommand() *cli.Command {
	return &cli.Command{
		Name:      "deploy",
		Usage:     "Deploy to a target subject",
		ArgsUsage: "<subject>",
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "nats-url",
				Usage:    "NATS server URL",
				Sources:  cli.EnvVars("HOMELAB_DEPLOY_NATS_URL"),
				Required: true,
			},
			&cli.StringFlag{
				Name:     "nkey-file",
				Usage:    "Path to NKey seed file for NATS authentication",
				Sources:  cli.EnvVars("HOMELAB_DEPLOY_NKEY_FILE"),
				Required: true,
			},
			&cli.StringFlag{
				Name:    "branch",
				Usage:   "Git branch or commit to deploy",
				Sources: cli.EnvVars("HOMELAB_DEPLOY_BRANCH"),
				Value:   "master",
			},
			&cli.StringFlag{
				Name:    "action",
				Usage:   "nixos-rebuild action (switch, boot, test, dry-activate)",
				Sources: cli.EnvVars("HOMELAB_DEPLOY_ACTION"),
				Value:   "switch",
			},
			&cli.IntFlag{
				Name:    "timeout",
				Usage:   "Timeout in seconds for collecting responses",
				Sources: cli.EnvVars("HOMELAB_DEPLOY_TIMEOUT"),
				Value:   900,
			},
		},
		Action: func(ctx context.Context, c *cli.Command) error {
			// Only the first positional argument is ever used, so refuse extra
			// ones instead of silently dropping them.
			switch n := c.Args().Len(); {
			case n < 1:
				return fmt.Errorf("subject argument required")
			case n > 1:
				return fmt.Errorf("expected exactly one subject argument, got %d", n)
			}

			subjectArg := c.Args().First()
			subject := deploycli.ResolveAlias(subjectArg)
			if deploycli.IsAlias(subjectArg) && subject != subjectArg {
				fmt.Printf("Resolved alias %q to %q\n", subjectArg, subject)
			}

			action := messages.Action(c.String("action"))
			if !action.Valid() {
				return fmt.Errorf("invalid action: %q", action)
			}

			// A negative value would otherwise become a negative duration in
			// the deploy configuration.
			timeout := c.Int("timeout")
			if timeout < 0 {
				return fmt.Errorf("timeout must not be negative, got %d", timeout)
			}

			cfg := deploycli.DeployConfig{
				NATSUrl:  c.String("nats-url"),
				NKeyFile: c.String("nkey-file"),
				Subject:  subject,
				Action:   action,
				// The --branch flag value is what gets sent as the revision.
				Revision: c.String("branch"),
				Timeout:  time.Duration(timeout) * time.Second,
			}

			fmt.Printf("Deploying to %s (action=%s, revision=%s)\n", subject, action, cfg.Revision)

			// Cancel the context on SIGINT/SIGTERM so Deploy can observe shutdown.
			ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
			defer cancel()

			result, err := deploycli.Deploy(ctx, cfg, func(resp *messages.DeployResponse) {
				// Append the error detail to the status when the response has one.
				status := string(resp.Status)
				if resp.Error != nil {
					status = fmt.Sprintf("%s (%s)", status, *resp.Error)
				}
				fmt.Printf("[%s] %s: %s\n", resp.Hostname, status, resp.Message)
			})
			if err != nil {
				return fmt.Errorf("deploy failed: %w", err)
			}

			fmt.Printf("\nDeployment complete: %d hosts responded\n", result.HostCount())

			if !result.AllSucceeded() {
				return fmt.Errorf("some deployments failed")
			}

			return nil
		},
	}
}
// listHostsCommand builds the "list-hosts" subcommand.
//
// The action validates the optional tier filter and the timeout, then calls
// deploycli.Discover with a context that is cancelled on SIGINT or SIGTERM.
// For every response that passes the tier filter it prints the hostname, tier,
// role and deploy subjects. If nothing responded, or no response matches the
// filter, it prints a message saying so instead of an empty listing.
//
// It returns an error when the tier filter is set to something other than
// "test" or "prod", when --timeout is negative, or when Discover fails.
func listHostsCommand() *cli.Command {
	return &cli.Command{
		Name:  "list-hosts",
		Usage: "List available deployment targets",
		Flags: []cli.Flag{
			&cli.StringFlag{
				Name:     "nats-url",
				Usage:    "NATS server URL",
				Sources:  cli.EnvVars("HOMELAB_DEPLOY_NATS_URL"),
				Required: true,
			},
			&cli.StringFlag{
				Name:     "nkey-file",
				Usage:    "Path to NKey seed file for NATS authentication",
				Sources:  cli.EnvVars("HOMELAB_DEPLOY_NKEY_FILE"),
				Required: true,
			},
			&cli.StringFlag{
				Name:    "tier",
				Usage:   "Filter by tier (test or prod)",
				Sources: cli.EnvVars("HOMELAB_DEPLOY_TIER"),
			},
			&cli.StringFlag{
				Name:    "discover-subject",
				Usage:   "NATS subject for host discovery",
				Sources: cli.EnvVars("HOMELAB_DEPLOY_DISCOVER_SUBJECT"),
				Value:   "deploy.discover",
			},
			&cli.IntFlag{
				Name:    "timeout",
				Usage:   "Timeout in seconds for discovery",
				Sources: cli.EnvVars("HOMELAB_DEPLOY_DISCOVER_TIMEOUT"),
				Value:   5,
			},
		},
		Action: func(ctx context.Context, c *cli.Command) error {
			tierFilter := c.String("tier")
			if tierFilter != "" && tierFilter != "test" && tierFilter != "prod" {
				return fmt.Errorf("tier must be 'test' or 'prod', got %q", tierFilter)
			}

			// A negative value would otherwise become a negative discovery
			// duration.
			timeout := c.Int("timeout")
			if timeout < 0 {
				return fmt.Errorf("timeout must not be negative, got %d", timeout)
			}

			// Cancel the context on SIGINT/SIGTERM so Discover can observe shutdown.
			ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
			defer cancel()

			responses, err := deploycli.Discover(
				ctx,
				c.String("nats-url"),
				c.String("nkey-file"),
				c.String("discover-subject"),
				time.Duration(timeout)*time.Second,
			)
			if err != nil {
				return fmt.Errorf("discovery failed: %w", err)
			}

			if len(responses) == 0 {
				fmt.Println("No hosts responded to discovery request")
				return nil
			}

			// The header is printed lazily, on the first host that passes the
			// filter, so a filter that excludes everything does not leave a
			// header with an empty list beneath it.
			shown := 0
			for _, resp := range responses {
				if tierFilter != "" && resp.Tier != tierFilter {
					continue
				}

				if shown == 0 {
					fmt.Println("Available deployment targets:")
					fmt.Println()
				}
				shown++

				role := resp.Role
				if role == "" {
					role = "(none)"
				}

				fmt.Printf("- %s (tier=%s, role=%s)\n", resp.Hostname, resp.Tier, role)
				for _, subj := range resp.DeploySubjects {
					fmt.Printf(" %s\n", subj)
				}
			}

			// Hosts responded, but the tier filter excluded every one of them.
			if shown == 0 {
				fmt.Printf("No hosts in tier %q responded to discovery request\n", tierFilter)
			}

			return nil
		},
	}
}