Compare commits
23 Commits
9f205fee5e
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
713d1e7584
|
|||
|
2d26de5055
|
|||
| e5e8be86ec | |||
|
3ac5d9777f
|
|||
|
1a23847d31
|
|||
|
c13914bf5a
|
|||
| a8aab16d0e | |||
|
00899489ac
|
|||
|
c52e88ca7e
|
|||
|
08f1fcc6ac
|
|||
|
14f5b31faf
|
|||
|
277a49a666
|
|||
|
bc02393c5a
|
|||
|
746e30b24f
|
|||
|
fd0d63b103
|
|||
|
36a74b8cf9
|
|||
|
79db119d1c
|
|||
|
56365835c7
|
|||
|
95b795dcfd
|
|||
|
71d6aa8b61
|
|||
|
2c97b6140c
|
|||
|
efacb13b86
|
|||
|
ac3c9c7de6
|
296
README.md
296
README.md
@@ -4,11 +4,12 @@ A message-based deployment system for NixOS configurations using NATS for messag
|
||||
|
||||
## Overview
|
||||
|
||||
The `homelab-deploy` binary provides three operational modes:
|
||||
The `homelab-deploy` binary provides four operational modes:
|
||||
|
||||
1. **Listener mode** - Runs on each NixOS host as a systemd service, subscribing to NATS subjects and executing `nixos-rebuild` when deployment requests arrive
|
||||
2. **MCP mode** - Runs as an MCP (Model Context Protocol) server, exposing deployment tools for AI assistants
|
||||
3. **CLI mode** - Manual deployment commands for administrators
|
||||
2. **Builder mode** - Runs on a dedicated build host, subscribing to NATS subjects and executing `nix build` to pre-build configurations
|
||||
3. **MCP mode** - Runs as an MCP (Model Context Protocol) server, exposing deployment tools for AI assistants
|
||||
4. **CLI mode** - Manual deployment and build commands for administrators
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -61,6 +62,8 @@ homelab-deploy listener \
|
||||
| `--timeout` | No | Deployment timeout in seconds (default: 600) |
|
||||
| `--deploy-subject` | No | NATS subjects to subscribe to (repeatable) |
|
||||
| `--discover-subject` | No | Discovery subject (default: `deploy.discover`) |
|
||||
| `--metrics-enabled` | No | Enable Prometheus metrics endpoint |
|
||||
| `--metrics-addr` | No | Metrics HTTP server address (default: `:9972`) |
|
||||
|
||||
#### Subject Templates
|
||||
|
||||
@@ -126,6 +129,82 @@ homelab-deploy deploy prod-dns --nats-url ... --nkey-file ...
|
||||
|
||||
Alias lookup: `HOMELAB_DEPLOY_ALIAS_<NAME>` where name is uppercased and hyphens become underscores.
|
||||
|
||||
### Builder Mode
|
||||
|
||||
Run on a dedicated build host to pre-build NixOS configurations:
|
||||
|
||||
```bash
|
||||
homelab-deploy builder \
|
||||
--nats-url nats://nats.example.com:4222 \
|
||||
--nkey-file /run/secrets/builder.nkey \
|
||||
--config /etc/homelab-deploy/builder.yaml \
|
||||
--timeout 1800 \
|
||||
--metrics-enabled \
|
||||
--metrics-addr :9973
|
||||
```
|
||||
|
||||
#### Builder Configuration File
|
||||
|
||||
The builder uses a YAML configuration file to define allowed repositories:
|
||||
|
||||
```yaml
|
||||
repos:
|
||||
nixos-servers:
|
||||
url: "git+https://git.example.com/org/nixos-servers.git"
|
||||
default_branch: "master"
|
||||
homelab:
|
||||
url: "git+ssh://git@github.com/user/homelab.git"
|
||||
default_branch: "main"
|
||||
```
|
||||
|
||||
#### Builder Flags
|
||||
|
||||
| Flag | Required | Description |
|
||||
|------|----------|-------------|
|
||||
| `--nats-url` | Yes | NATS server URL |
|
||||
| `--nkey-file` | Yes | Path to NKey seed file |
|
||||
| `--config` | Yes | Path to builder configuration file |
|
||||
| `--timeout` | No | Build timeout per host in seconds (default: 1800) |
|
||||
| `--metrics-enabled` | No | Enable Prometheus metrics endpoint |
|
||||
| `--metrics-addr` | No | Metrics HTTP server address (default: `:9973`) |
|
||||
|
||||
### Build Command
|
||||
|
||||
Trigger a build on the build server:
|
||||
|
||||
```bash
|
||||
# Build all hosts in a repository
|
||||
homelab-deploy build nixos-servers --all \
|
||||
--nats-url nats://nats.example.com:4222 \
|
||||
--nkey-file /run/secrets/deployer.nkey
|
||||
|
||||
# Build a specific host
|
||||
homelab-deploy build nixos-servers myhost \
|
||||
--nats-url nats://nats.example.com:4222 \
|
||||
--nkey-file /run/secrets/deployer.nkey
|
||||
|
||||
# Build with a specific branch
|
||||
homelab-deploy build nixos-servers --all --branch feature-x \
|
||||
--nats-url nats://nats.example.com:4222 \
|
||||
--nkey-file /run/secrets/deployer.nkey
|
||||
|
||||
# JSON output for scripting
|
||||
homelab-deploy build nixos-servers --all --json \
|
||||
--nats-url nats://nats.example.com:4222 \
|
||||
--nkey-file /run/secrets/deployer.nkey
|
||||
```
|
||||
|
||||
#### Build Flags
|
||||
|
||||
| Flag | Required | Env Var | Description |
|
||||
|------|----------|---------|-------------|
|
||||
| `--nats-url` | Yes | `HOMELAB_DEPLOY_NATS_URL` | NATS server URL |
|
||||
| `--nkey-file` | Yes | `HOMELAB_DEPLOY_NKEY_FILE` | Path to NKey seed file |
|
||||
| `--branch` | No | `HOMELAB_DEPLOY_BRANCH` | Git branch (uses repo default if not specified) |
|
||||
| `--all` | No | - | Build all hosts in the repository |
|
||||
| `--timeout` | No | `HOMELAB_DEPLOY_BUILD_TIMEOUT` | Response timeout in seconds (default: 3600) |
|
||||
| `--json` | No | - | Output results as JSON |
|
||||
|
||||
### MCP Server Mode
|
||||
|
||||
Run as an MCP server for AI assistant integration:
|
||||
@@ -142,6 +221,12 @@ homelab-deploy mcp \
|
||||
--nkey-file /run/secrets/mcp.nkey \
|
||||
--enable-admin \
|
||||
--admin-nkey-file /run/secrets/admin.nkey
|
||||
|
||||
# With build tool enabled
|
||||
homelab-deploy mcp \
|
||||
--nats-url nats://nats.example.com:4222 \
|
||||
--nkey-file /run/secrets/mcp.nkey \
|
||||
--enable-builds
|
||||
```
|
||||
|
||||
#### MCP Tools
|
||||
@@ -151,6 +236,7 @@ homelab-deploy mcp \
|
||||
| `deploy` | Deploy to test-tier hosts only |
|
||||
| `deploy_admin` | Deploy to any tier (requires `--enable-admin`) |
|
||||
| `list_hosts` | Discover available deployment targets |
|
||||
| `build` | Trigger builds on the build server (requires `--enable-builds`) |
|
||||
|
||||
#### Tool Parameters
|
||||
|
||||
@@ -165,6 +251,12 @@ homelab-deploy mcp \
|
||||
**list_hosts:**
|
||||
- `tier` - Filter by tier (optional)
|
||||
|
||||
**build:**
|
||||
- `repo` - Repository name (required, must match builder config)
|
||||
- `target` - Target hostname (optional, defaults to all)
|
||||
- `all` - Build all hosts (default if no target specified)
|
||||
- `branch` - Git branch (uses repo default if not specified)
|
||||
|
||||
## NixOS Module
|
||||
|
||||
Add the module to your NixOS configuration:
|
||||
@@ -209,6 +301,9 @@ Add the module to your NixOS configuration:
|
||||
| `deploySubjects` | list of string | see below | Subjects to subscribe to |
|
||||
| `discoverSubject` | string | `"deploy.discover"` | Discovery subject |
|
||||
| `environment` | attrs | `{}` | Additional environment variables |
|
||||
| `metrics.enable` | bool | `false` | Enable Prometheus metrics endpoint |
|
||||
| `metrics.address` | string | `":9972"` | Metrics HTTP server address |
|
||||
| `metrics.openFirewall` | bool | `false` | Open firewall for metrics port |
|
||||
|
||||
Default `deploySubjects`:
|
||||
```nix
|
||||
@@ -219,6 +314,157 @@ Default `deploySubjects`:
|
||||
]
|
||||
```
|
||||
|
||||
### Builder Module Options
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `enable` | bool | `false` | Enable the builder service |
|
||||
| `package` | package | from flake | Package to use |
|
||||
| `natsUrl` | string | required | NATS server URL |
|
||||
| `nkeyFile` | path | required | Path to NKey seed file |
|
||||
| `configFile` | path | `null` | Path to builder config file (alternative to `settings`) |
|
||||
| `settings.repos` | attrs | `{}` | Repository configuration (see below) |
|
||||
| `timeout` | int | `1800` | Build timeout per host in seconds |
|
||||
| `environment` | attrs | `{}` | Additional environment variables |
|
||||
| `metrics.enable` | bool | `false` | Enable Prometheus metrics endpoint |
|
||||
| `metrics.address` | string | `":9973"` | Metrics HTTP server address |
|
||||
| `metrics.openFirewall` | bool | `false` | Open firewall for metrics port |
|
||||
|
||||
Each entry in `settings.repos` is an attribute set with:
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `url` | string | required | Git flake URL (must start with `git+https://`, `git+ssh://`, or `git+file://`) |
|
||||
| `defaultBranch` | string | `"master"` | Default branch to build when not specified |
|
||||
|
||||
Example builder configuration using `settings`:
|
||||
|
||||
```nix
|
||||
services.homelab-deploy.builder = {
|
||||
enable = true;
|
||||
natsUrl = "nats://nats.example.com:4222";
|
||||
nkeyFile = "/run/secrets/homelab-deploy-builder-nkey";
|
||||
settings.repos = {
|
||||
nixos-servers = {
|
||||
url = "git+https://git.example.com/org/nixos-servers.git";
|
||||
defaultBranch = "master";
|
||||
};
|
||||
homelab = {
|
||||
url = "git+ssh://git@github.com/user/homelab.git";
|
||||
defaultBranch = "main";
|
||||
};
|
||||
};
|
||||
metrics = {
|
||||
enable = true;
|
||||
address = ":9973";
|
||||
openFirewall = true;
|
||||
};
|
||||
};
|
||||
```
|
||||
|
||||
Alternatively, you can use `configFile` to point to an external YAML file:
|
||||
|
||||
```nix
|
||||
services.homelab-deploy.builder = {
|
||||
enable = true;
|
||||
natsUrl = "nats://nats.example.com:4222";
|
||||
nkeyFile = "/run/secrets/homelab-deploy-builder-nkey";
|
||||
configFile = "/etc/homelab-deploy/builder.yaml";
|
||||
};
|
||||
```
|
||||
|
||||
## Prometheus Metrics
|
||||
|
||||
The listener can expose Prometheus metrics for monitoring deployment operations.
|
||||
|
||||
### Enabling Metrics
|
||||
|
||||
**CLI:**
|
||||
```bash
|
||||
homelab-deploy listener \
|
||||
--hostname myhost \
|
||||
--tier prod \
|
||||
--nats-url nats://nats.example.com:4222 \
|
||||
--nkey-file /run/secrets/listener.nkey \
|
||||
--flake-url git+https://git.example.com/user/nixos-configs.git \
|
||||
--metrics-enabled \
|
||||
--metrics-addr :9972
|
||||
```
|
||||
|
||||
**NixOS module:**
|
||||
```nix
|
||||
services.homelab-deploy.listener = {
|
||||
enable = true;
|
||||
tier = "prod";
|
||||
natsUrl = "nats://nats.example.com:4222";
|
||||
nkeyFile = "/run/secrets/homelab-deploy-nkey";
|
||||
flakeUrl = "git+https://git.example.com/user/nixos-configs.git";
|
||||
metrics = {
|
||||
enable = true;
|
||||
address = ":9972";
|
||||
openFirewall = true; # Optional: open firewall for Prometheus scraping
|
||||
};
|
||||
};
|
||||
```
|
||||
|
||||
### Available Metrics
|
||||
|
||||
| Metric | Type | Labels | Description |
|
||||
|--------|------|--------|-------------|
|
||||
| `homelab_deploy_deployments_total` | Counter | `status`, `action`, `error_code` | Total deployment requests processed |
|
||||
| `homelab_deploy_deployment_duration_seconds` | Histogram | `action`, `success` | Deployment execution time |
|
||||
| `homelab_deploy_deployment_in_progress` | Gauge | - | 1 if deployment running, 0 otherwise |
|
||||
| `homelab_deploy_info` | Gauge | `hostname`, `tier`, `role`, `version` | Static instance metadata |
|
||||
|
||||
**Label values:**
|
||||
- `status`: `completed`, `failed`, `rejected`
|
||||
- `action`: `switch`, `boot`, `test`, `dry-activate`
|
||||
- `error_code`: `invalid_action`, `invalid_revision`, `already_running`, `build_failed`, `timeout`, or empty
|
||||
- `success`: `true`, `false`
|
||||
|
||||
### HTTP Endpoints
|
||||
|
||||
| Endpoint | Description |
|
||||
|----------|-------------|
|
||||
| `/metrics` | Prometheus metrics in text format |
|
||||
| `/health` | Health check (returns `ok`) |
|
||||
|
||||
### Example Prometheus Queries
|
||||
|
||||
```promql
|
||||
# Average deployment duration (last hour)
|
||||
rate(homelab_deploy_deployment_duration_seconds_sum[1h]) /
|
||||
rate(homelab_deploy_deployment_duration_seconds_count[1h])
|
||||
|
||||
# Deployment success rate (last 24 hours)
|
||||
sum(rate(homelab_deploy_deployments_total{status="completed"}[24h])) /
|
||||
sum(rate(homelab_deploy_deployments_total{status=~"completed|failed"}[24h]))
|
||||
|
||||
# 95th percentile deployment time
|
||||
histogram_quantile(0.95, rate(homelab_deploy_deployment_duration_seconds_bucket[1h]))
|
||||
|
||||
# Currently running deployments across all hosts
|
||||
sum(homelab_deploy_deployment_in_progress)
|
||||
```
|
||||
|
||||
### Builder Metrics
|
||||
|
||||
When running in builder mode, additional metrics are available:
|
||||
|
||||
| Metric | Type | Labels | Description |
|
||||
|--------|------|--------|-------------|
|
||||
| `homelab_deploy_builds_total` | Counter | `repo`, `status` | Total builds processed |
|
||||
| `homelab_deploy_build_host_total` | Counter | `repo`, `host`, `status` | Total host builds processed |
|
||||
| `homelab_deploy_build_duration_seconds` | Histogram | `repo`, `host` | Build execution time per host |
|
||||
| `homelab_deploy_build_last_timestamp` | Gauge | `repo` | Timestamp of last build attempt |
|
||||
| `homelab_deploy_build_last_success_timestamp` | Gauge | `repo` | Timestamp of last successful build |
|
||||
| `homelab_deploy_build_last_failure_timestamp` | Gauge | `repo` | Timestamp of last failed build |
|
||||
|
||||
**Label values:**
|
||||
- `status`: `success`, `failure`
|
||||
- `repo`: Repository name from config
|
||||
- `host`: Host name being built
|
||||
|
||||
## Message Protocol
|
||||
|
||||
### Deploy Request
|
||||
@@ -246,6 +492,37 @@ Default `deploySubjects`:
|
||||
|
||||
**Error codes:** `invalid_revision`, `invalid_action`, `already_running`, `build_failed`, `timeout`
|
||||
|
||||
### Build Request
|
||||
|
||||
```json
|
||||
{
|
||||
"repo": "nixos-servers",
|
||||
"target": "all",
|
||||
"branch": "main",
|
||||
"reply_to": "build.responses.abc123"
|
||||
}
|
||||
```
|
||||
|
||||
### Build Response
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "completed",
|
||||
"message": "built 5/5 hosts successfully",
|
||||
"results": [
|
||||
{"host": "host1", "success": true, "duration_seconds": 120.5},
|
||||
{"host": "host2", "success": true, "duration_seconds": 95.3}
|
||||
],
|
||||
"total_duration_seconds": 450.2,
|
||||
"succeeded": 5,
|
||||
"failed": 0
|
||||
}
|
||||
```
|
||||
|
||||
**Status values:** `started`, `progress`, `completed`, `failed`, `rejected`
|
||||
|
||||
Progress updates include `host`, `host_success`, `hosts_completed`, and `hosts_total` fields.
|
||||
|
||||
## NATS Authentication
|
||||
|
||||
All connections use NKey authentication. Generate keys with:
|
||||
@@ -275,13 +552,22 @@ The deployment system uses the following NATS subject hierarchy:
|
||||
- `deploy.prod.all` - Deploy to all production hosts
|
||||
- `deploy.prod.role.dns` - Deploy to all DNS servers in production
|
||||
|
||||
### Build Subjects
|
||||
|
||||
| Subject Pattern | Purpose |
|
||||
|-----------------|---------|
|
||||
| `build.<repo>.*` | Build requests for a repository |
|
||||
| `build.<repo>.all` | Build all hosts in a repository |
|
||||
| `build.<repo>.<hostname>` | Build a specific host |
|
||||
|
||||
### Response Subjects
|
||||
|
||||
| Subject Pattern | Purpose |
|
||||
|-----------------|---------|
|
||||
| `deploy.responses.<uuid>` | Unique reply subject for each deployment request |
|
||||
| `build.responses.<uuid>` | Unique reply subject for each build request |
|
||||
|
||||
Deployers create a unique response subject for each request and include it in the `reply_to` field. Listeners publish status updates to this subject.
|
||||
Deployers and build clients create a unique response subject for each request and include it in the `reply_to` field. Listeners and builders publish status updates to this subject.
|
||||
|
||||
### Discovery Subject
|
||||
|
||||
@@ -372,7 +658,9 @@ authorization {
|
||||
| Credential Type | Publish | Subscribe |
|
||||
|-----------------|---------|-----------|
|
||||
| Listener | `deploy.responses.>`, `deploy.discover` | Own subjects, `deploy.discover` |
|
||||
| Builder | `build.responses.>` | `build.<repo>.*` for each configured repo |
|
||||
| Test deployer | `deploy.test.>`, `deploy.discover` | `deploy.responses.>`, `deploy.discover` |
|
||||
| Build client | `build.<repo>.*` | `build.responses.>` |
|
||||
| Admin deployer | `deploy.>` | `deploy.>` |
|
||||
|
||||
### Generating NKeys
|
||||
|
||||
@@ -9,14 +9,15 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
deploycli "git.t-juice.club/torjus/homelab-deploy/internal/cli"
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/listener"
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/mcp"
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/builder"
|
||||
deploycli "code.t-juice.club/torjus/homelab-deploy/internal/cli"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/listener"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/mcp"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"github.com/urfave/cli/v3"
|
||||
)
|
||||
|
||||
const version = "0.1.4"
|
||||
const version = "0.2.5"
|
||||
|
||||
func main() {
|
||||
app := &cli.Command{
|
||||
@@ -25,8 +26,11 @@ func main() {
|
||||
Version: version,
|
||||
Commands: []*cli.Command{
|
||||
listenerCommand(),
|
||||
builderCommand(),
|
||||
mcpCommand(),
|
||||
deployCommand(),
|
||||
buildCommand(),
|
||||
listHostsCommand(),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -89,6 +93,20 @@ func listenerCommand() *cli.Command {
|
||||
Usage: "NATS subject for host discovery requests",
|
||||
Value: "deploy.discover",
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "metrics-enabled",
|
||||
Usage: "Enable Prometheus metrics endpoint",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "metrics-addr",
|
||||
Usage: "Address for Prometheus metrics HTTP server",
|
||||
Value: ":9972",
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "heartbeat-interval",
|
||||
Usage: "Interval in seconds for sending status updates during deployment (0 to disable)",
|
||||
Value: 15,
|
||||
},
|
||||
},
|
||||
Action: func(ctx context.Context, c *cli.Command) error {
|
||||
tier := c.String("tier")
|
||||
@@ -97,15 +115,19 @@ func listenerCommand() *cli.Command {
|
||||
}
|
||||
|
||||
cfg := listener.Config{
|
||||
Hostname: c.String("hostname"),
|
||||
Tier: tier,
|
||||
Role: c.String("role"),
|
||||
NATSUrl: c.String("nats-url"),
|
||||
NKeyFile: c.String("nkey-file"),
|
||||
FlakeURL: c.String("flake-url"),
|
||||
Timeout: time.Duration(c.Int("timeout")) * time.Second,
|
||||
DeploySubjects: c.StringSlice("deploy-subject"),
|
||||
DiscoverSubject: c.String("discover-subject"),
|
||||
Hostname: c.String("hostname"),
|
||||
Tier: tier,
|
||||
Role: c.String("role"),
|
||||
NATSUrl: c.String("nats-url"),
|
||||
NKeyFile: c.String("nkey-file"),
|
||||
FlakeURL: c.String("flake-url"),
|
||||
Timeout: time.Duration(c.Int("timeout")) * time.Second,
|
||||
HeartbeatInterval: time.Duration(c.Int("heartbeat-interval")) * time.Second,
|
||||
DeploySubjects: c.StringSlice("deploy-subject"),
|
||||
DiscoverSubject: c.String("discover-subject"),
|
||||
MetricsEnabled: c.Bool("metrics-enabled"),
|
||||
MetricsAddr: c.String("metrics-addr"),
|
||||
Version: version,
|
||||
}
|
||||
|
||||
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||
@@ -156,6 +178,10 @@ func mcpCommand() *cli.Command {
|
||||
Usage: "Timeout in seconds for deployment operations",
|
||||
Value: 900,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "enable-builds",
|
||||
Usage: "Enable build tool",
|
||||
},
|
||||
},
|
||||
Action: func(_ context.Context, c *cli.Command) error {
|
||||
enableAdmin := c.Bool("enable-admin")
|
||||
@@ -170,6 +196,7 @@ func mcpCommand() *cli.Command {
|
||||
NKeyFile: c.String("nkey-file"),
|
||||
EnableAdmin: enableAdmin,
|
||||
AdminNKeyFile: adminNKeyFile,
|
||||
EnableBuilds: c.Bool("enable-builds"),
|
||||
DiscoverSubject: c.String("discover-subject"),
|
||||
Timeout: time.Duration(c.Int("timeout")) * time.Second,
|
||||
}
|
||||
@@ -270,3 +297,289 @@ func deployCommand() *cli.Command {
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func listHostsCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "list-hosts",
|
||||
Usage: "List available deployment targets",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "nats-url",
|
||||
Usage: "NATS server URL",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_NATS_URL"),
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "nkey-file",
|
||||
Usage: "Path to NKey seed file for NATS authentication",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_NKEY_FILE"),
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "tier",
|
||||
Usage: "Filter by tier (test or prod)",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_TIER"),
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "discover-subject",
|
||||
Usage: "NATS subject for host discovery",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_DISCOVER_SUBJECT"),
|
||||
Value: "deploy.discover",
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "timeout",
|
||||
Usage: "Timeout in seconds for discovery",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_DISCOVER_TIMEOUT"),
|
||||
Value: 5,
|
||||
},
|
||||
},
|
||||
Action: func(ctx context.Context, c *cli.Command) error {
|
||||
tierFilter := c.String("tier")
|
||||
if tierFilter != "" && tierFilter != "test" && tierFilter != "prod" {
|
||||
return fmt.Errorf("tier must be 'test' or 'prod', got %q", tierFilter)
|
||||
}
|
||||
|
||||
// Handle shutdown signals
|
||||
ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
responses, err := deploycli.Discover(
|
||||
ctx,
|
||||
c.String("nats-url"),
|
||||
c.String("nkey-file"),
|
||||
c.String("discover-subject"),
|
||||
time.Duration(c.Int("timeout"))*time.Second,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("discovery failed: %w", err)
|
||||
}
|
||||
|
||||
if len(responses) == 0 {
|
||||
fmt.Println("No hosts responded to discovery request")
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Println("Available deployment targets:")
|
||||
fmt.Println()
|
||||
|
||||
for _, resp := range responses {
|
||||
if tierFilter != "" && resp.Tier != tierFilter {
|
||||
continue
|
||||
}
|
||||
|
||||
role := resp.Role
|
||||
if role == "" {
|
||||
role = "(none)"
|
||||
}
|
||||
|
||||
fmt.Printf("- %s (tier=%s, role=%s)\n", resp.Hostname, resp.Tier, role)
|
||||
for _, subj := range resp.DeploySubjects {
|
||||
fmt.Printf(" %s\n", subj)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func builderCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "builder",
|
||||
Usage: "Run as a build server (systemd service mode)",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "nats-url",
|
||||
Usage: "NATS server URL",
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "nkey-file",
|
||||
Usage: "Path to NKey seed file for NATS authentication",
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "config",
|
||||
Usage: "Path to builder configuration file",
|
||||
Required: true,
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "timeout",
|
||||
Usage: "Build timeout in seconds per host",
|
||||
Value: 1800,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "metrics-enabled",
|
||||
Usage: "Enable Prometheus metrics endpoint",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "metrics-addr",
|
||||
Usage: "Address for Prometheus metrics HTTP server",
|
||||
Value: ":9973",
|
||||
},
|
||||
},
|
||||
Action: func(ctx context.Context, c *cli.Command) error {
|
||||
repoCfg, err := builder.LoadConfig(c.String("config"))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load config: %w", err)
|
||||
}
|
||||
|
||||
cfg := builder.BuilderConfig{
|
||||
NATSUrl: c.String("nats-url"),
|
||||
NKeyFile: c.String("nkey-file"),
|
||||
ConfigFile: c.String("config"),
|
||||
Timeout: time.Duration(c.Int("timeout")) * time.Second,
|
||||
MetricsEnabled: c.Bool("metrics-enabled"),
|
||||
MetricsAddr: c.String("metrics-addr"),
|
||||
}
|
||||
|
||||
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
}))
|
||||
|
||||
b := builder.New(cfg, repoCfg, logger)
|
||||
|
||||
// Handle shutdown signals
|
||||
ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
return b.Run(ctx)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func buildCommand() *cli.Command {
|
||||
return &cli.Command{
|
||||
Name: "build",
|
||||
Usage: "Trigger a build on the build server",
|
||||
ArgsUsage: "<repo> [hostname]",
|
||||
Flags: []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "nats-url",
|
||||
Usage: "NATS server URL",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_NATS_URL"),
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "nkey-file",
|
||||
Usage: "Path to NKey seed file for NATS authentication",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_NKEY_FILE"),
|
||||
Required: true,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "branch",
|
||||
Usage: "Git branch to build (uses repo default if not specified)",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_BRANCH"),
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "all",
|
||||
Usage: "Build all hosts in the repo",
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "timeout",
|
||||
Usage: "Timeout in seconds for collecting responses",
|
||||
Sources: cli.EnvVars("HOMELAB_DEPLOY_BUILD_TIMEOUT"),
|
||||
Value: 3600,
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "json",
|
||||
Usage: "Output results as JSON",
|
||||
},
|
||||
},
|
||||
Action: func(ctx context.Context, c *cli.Command) error {
|
||||
if c.Args().Len() < 1 {
|
||||
return fmt.Errorf("repo argument required")
|
||||
}
|
||||
|
||||
repo := c.Args().First()
|
||||
target := c.Args().Get(1)
|
||||
all := c.Bool("all")
|
||||
|
||||
if target == "" && !all {
|
||||
return fmt.Errorf("must specify hostname or --all")
|
||||
}
|
||||
if target != "" && all {
|
||||
return fmt.Errorf("cannot specify both hostname and --all")
|
||||
}
|
||||
if all {
|
||||
target = "all"
|
||||
}
|
||||
|
||||
cfg := deploycli.BuildConfig{
|
||||
NATSUrl: c.String("nats-url"),
|
||||
NKeyFile: c.String("nkey-file"),
|
||||
Repo: repo,
|
||||
Target: target,
|
||||
Branch: c.String("branch"),
|
||||
Timeout: time.Duration(c.Int("timeout")) * time.Second,
|
||||
}
|
||||
|
||||
jsonOutput := c.Bool("json")
|
||||
if !jsonOutput {
|
||||
branchStr := cfg.Branch
|
||||
if branchStr == "" {
|
||||
branchStr = "(default)"
|
||||
}
|
||||
fmt.Printf("Building %s target=%s branch=%s\n", repo, target, branchStr)
|
||||
}
|
||||
|
||||
// Handle shutdown signals
|
||||
ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
result, err := deploycli.Build(ctx, cfg, func(resp *messages.BuildResponse) {
|
||||
if jsonOutput {
|
||||
return
|
||||
}
|
||||
switch resp.Status {
|
||||
case messages.BuildStatusStarted:
|
||||
fmt.Printf("Started: %s\n", resp.Message)
|
||||
case messages.BuildStatusProgress:
|
||||
successStr := "..."
|
||||
if resp.HostSuccess != nil {
|
||||
if *resp.HostSuccess {
|
||||
successStr = "success"
|
||||
} else {
|
||||
successStr = "failed"
|
||||
}
|
||||
}
|
||||
fmt.Printf("[%d/%d] %s: %s\n", resp.HostsCompleted, resp.HostsTotal, resp.Host, successStr)
|
||||
case messages.BuildStatusCompleted, messages.BuildStatusFailed:
|
||||
fmt.Printf("\n%s\n", resp.Message)
|
||||
case messages.BuildStatusRejected:
|
||||
fmt.Printf("Rejected: %s\n", resp.Message)
|
||||
}
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("build failed: %w", err)
|
||||
}
|
||||
|
||||
if jsonOutput {
|
||||
data, err := result.MarshalJSON()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal result: %w", err)
|
||||
}
|
||||
fmt.Println(string(data))
|
||||
} else if result.FinalResponse != nil {
|
||||
fmt.Printf("\nBuild complete: %d succeeded, %d failed (%.1fs)\n",
|
||||
result.FinalResponse.Succeeded,
|
||||
result.FinalResponse.Failed,
|
||||
result.FinalResponse.TotalDurationSeconds)
|
||||
for _, hr := range result.FinalResponse.Results {
|
||||
if !hr.Success {
|
||||
fmt.Printf("\n--- %s (error: %s) ---\n", hr.Host, hr.Error)
|
||||
if hr.Output != "" {
|
||||
fmt.Println(hr.Output)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !result.AllSucceeded() {
|
||||
return fmt.Errorf("some builds failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -2,11 +2,11 @@
|
||||
"nodes": {
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1770197578,
|
||||
"narHash": "sha256-AYqlWrX09+HvGs8zM6ebZ1pwUqjkfpnv8mewYwAo+iM=",
|
||||
"lastModified": 1770562336,
|
||||
"narHash": "sha256-ub1gpAONMFsT/GU2hV6ZWJjur8rJ6kKxdm9IlCT0j84=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "00c21e4c93d963c50d4c0c89bfa84ed6e0694df2",
|
||||
"rev": "d6c71932130818840fc8fe9509cf50be8c64634f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
pname = "homelab-deploy";
|
||||
inherit version;
|
||||
src = ./.;
|
||||
vendorHash = "sha256-JXa+obN62zrrwXlplqojY7dvEunUqDdSTee6N8c5JTg=";
|
||||
vendorHash = "sha256-CN+l0JbQu+HDfotkt3PUFzBexHCHpCKIIZpAQRyojBk=";
|
||||
subPackages = [ "cmd/homelab-deploy" ];
|
||||
};
|
||||
default = self.packages.${system}.homelab-deploy;
|
||||
|
||||
14
go.mod
14
go.mod
@@ -1,4 +1,4 @@
|
||||
module git.t-juice.club/torjus/homelab-deploy
|
||||
module code.t-juice.club/torjus/homelab-deploy
|
||||
|
||||
go 1.25.5
|
||||
|
||||
@@ -7,20 +7,30 @@ require (
|
||||
github.com/mark3labs/mcp-go v0.43.2
|
||||
github.com/nats-io/nats.go v1.48.0
|
||||
github.com/nats-io/nkeys v0.4.15
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/urfave/cli/v3 v3.6.2
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/bahlo/generic-list-go v0.2.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/buger/jsonparser v1.1.1 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/invopop/jsonschema v0.13.0 // indirect
|
||||
github.com/klauspost/compress v1.18.0 // indirect
|
||||
github.com/kylelemons/godebug v1.1.0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/common v0.66.1 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/spf13/cast v1.7.1 // indirect
|
||||
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
|
||||
github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.2 // indirect
|
||||
golang.org/x/crypto v0.47.0 // indirect
|
||||
golang.org/x/sys v0.40.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
google.golang.org/protobuf v1.36.8 // indirect
|
||||
)
|
||||
|
||||
33
go.sum
33
go.sum
@@ -1,13 +1,17 @@
|
||||
github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
|
||||
github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
||||
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
|
||||
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
|
||||
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
|
||||
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/invopop/jsonschema v0.13.0 h1:KvpoAJWEjR3uD9Kbm2HWJmqsEaHt8lBUpd0qHcIi21E=
|
||||
@@ -19,10 +23,14 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||
github.com/mark3labs/mcp-go v0.43.2 h1:21PUSlWWiSbUPQwXIJ5WKlETixpFpq+WBpbMGDSVy/I=
|
||||
github.com/mark3labs/mcp-go v0.43.2/go.mod h1:YnJfOL382MIWDx1kMY+2zsRHU/q78dBg9aFb8W6Thdw=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
|
||||
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
|
||||
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
|
||||
@@ -31,8 +39,16 @@ github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
|
||||
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
|
||||
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
|
||||
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
|
||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||
github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
|
||||
github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
@@ -43,11 +59,18 @@ github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/
|
||||
github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
|
||||
github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
|
||||
github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
|
||||
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
|
||||
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
|
||||
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
||||
377
internal/builder/builder.go
Normal file
377
internal/builder/builder.go
Normal file
@@ -0,0 +1,377 @@
|
||||
package builder
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/metrics"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/nats"
|
||||
)
|
||||
|
||||
// hostnameRegex validates hostnames from flake output.
// Allows: alphanumeric, dashes, underscores, dots.
// Hosts failing this check are filtered out before being interpolated
// into flake references (see handleBuildRequest), preventing injection
// of nix CLI syntax via attacker-controlled configuration names.
var hostnameRegex = regexp.MustCompile(`^[a-zA-Z0-9._-]+$`)
|
||||
|
||||
// truncateOutputLines caps output at its first and last keepLines lines,
// inserting a single "... (N lines omitted) ..." marker for the middle.
// Output that already fits within 2*keepLines lines is returned split
// on newlines but otherwise untouched.
func truncateOutputLines(output string, keepLines int) []string {
	all := strings.Split(output, "\n")
	total := len(all)
	if total <= 2*keepLines {
		return all
	}
	omitted := total - 2*keepLines
	truncated := make([]string, 0, 2*keepLines+1)
	truncated = append(truncated, all[:keepLines]...)
	truncated = append(truncated, fmt.Sprintf("... (%d lines omitted) ...", omitted))
	truncated = append(truncated, all[total-keepLines:]...)
	return truncated
}
|
||||
|
||||
// truncateOutput caps output to its first and last keepLines lines,
// replacing the middle with a blank-line-padded "... (N lines omitted) ..."
// marker. Output within 2*keepLines lines is returned verbatim.
func truncateOutput(output string, keepLines int) string {
	all := strings.Split(output, "\n")
	total := len(all)
	if total <= 2*keepLines {
		return output
	}
	marker := fmt.Sprintf("\n\n... (%d lines omitted) ...\n\n", total-2*keepLines)
	return strings.Join(all[:keepLines], "\n") + marker + strings.Join(all[total-keepLines:], "\n")
}
|
||||
|
||||
// BuilderConfig holds the configuration for the builder.
type BuilderConfig struct {
	NATSUrl        string        // NATS server URL, passed to nats.Connect in Run
	NKeyFile       string        // path to an NKey file, passed to nats.Connect for auth
	ConfigFile     string        // path to the repo configuration file; presumably loaded by the caller into *Config — not read here
	Timeout        time.Duration // per-build timeout handed to NewExecutor
	MetricsEnabled bool          // when true, New prepares a metrics server and collector
	MetricsAddr    string        // listen address for the metrics HTTP server
}
|
||||
|
||||
// Builder handles build requests from NATS.
//
// Only one build runs at a time: handleBuildRequest uses lock/busy to
// reject requests that arrive while a build is in flight.
type Builder struct {
	cfg      BuilderConfig // runtime settings (NATS URL, timeout, metrics)
	repoCfg  *Config       // repositories this builder accepts requests for
	client   *nats.Client  // set by Run after a successful connect
	executor *Executor     // runs nix commands, bounded by cfg.Timeout
	lock     sync.Mutex    // guards busy
	busy     bool          // true while a build is in progress
	logger   *slog.Logger

	// metrics server and collector (nil if metrics disabled)
	metricsServer *metrics.Server
	metrics       *metrics.BuildCollector
}
|
||||
|
||||
// New creates a new builder with the given configuration.
|
||||
func New(cfg BuilderConfig, repoCfg *Config, logger *slog.Logger) *Builder {
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
b := &Builder{
|
||||
cfg: cfg,
|
||||
repoCfg: repoCfg,
|
||||
executor: NewExecutor(cfg.Timeout),
|
||||
logger: logger,
|
||||
}
|
||||
|
||||
if cfg.MetricsEnabled {
|
||||
b.metricsServer = metrics.NewServer(metrics.ServerConfig{
|
||||
Addr: cfg.MetricsAddr,
|
||||
Logger: logger,
|
||||
})
|
||||
b.metrics = metrics.NewBuildCollector(b.metricsServer.Registry())
|
||||
}
|
||||
|
||||
return b
|
||||
}
|
||||
|
||||
// Run starts the builder and blocks until the context is cancelled.
//
// Startup order: (1) metrics server if configured, (2) NATS connection,
// (3) one subscription per configured repo. On any startup failure the
// error is returned and the deferred cleanups (metrics shutdown, NATS
// close) run. Returns nil after ctx is cancelled.
func (b *Builder) Run(ctx context.Context) error {
	// Start metrics server if enabled
	if b.metricsServer != nil {
		if err := b.metricsServer.Start(); err != nil {
			return fmt.Errorf("failed to start metrics server: %w", err)
		}
		// Give the metrics server a bounded window to drain on shutdown.
		defer func() {
			shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()
			_ = b.metricsServer.Shutdown(shutdownCtx)
		}()
	}

	// Connect to NATS
	b.logger.Info("connecting to NATS", "url", b.cfg.NATSUrl)

	client, err := nats.Connect(nats.Config{
		URL:      b.cfg.NATSUrl,
		NKeyFile: b.cfg.NKeyFile,
		Name:     "homelab-deploy-builder",
	})
	if err != nil {
		return fmt.Errorf("failed to connect to NATS: %w", err)
	}
	b.client = client
	defer b.client.Close()

	b.logger.Info("connected to NATS")

	// Subscribe to build subjects for each repo
	for repoName := range b.repoCfg.Repos {
		// Subscribe to build.<repo>.all and build.<repo>.<hostname>
		// via a single wildcard subscription.
		allSubject := fmt.Sprintf("build.%s.*", repoName)
		b.logger.Info("subscribing to build subject", "subject", allSubject)
		if _, err := b.client.Subscribe(allSubject, b.handleBuildRequest); err != nil {
			return fmt.Errorf("failed to subscribe to %s: %w", allSubject, err)
		}
	}

	b.logger.Info("builder started", "repos", len(b.repoCfg.Repos))

	// Wait for context cancellation
	<-ctx.Done()
	b.logger.Info("shutting down builder")

	return nil
}
|
||||
|
||||
// handleBuildRequest is the NATS handler for build.<repo>.<target>
// subjects. It validates the request, enforces single-build concurrency,
// resolves the host list (the literal target, or every nixosConfiguration
// in the flake when target is "all"), builds each host sequentially, and
// streams started/progress/final responses to req.ReplyTo.
func (b *Builder) handleBuildRequest(subject string, data []byte) {
	req, err := messages.UnmarshalBuildRequest(data)
	if err != nil {
		// No reply subject is known yet, so we can only log.
		b.logger.Error("failed to unmarshal build request",
			"subject", subject,
			"error", err,
		)
		return
	}

	b.logger.Info("received build request",
		"subject", subject,
		"repo", req.Repo,
		"target", req.Target,
		"branch", req.Branch,
		"reply_to", req.ReplyTo,
	)

	// Validate request
	if err := req.Validate(); err != nil {
		b.logger.Warn("invalid build request", "error", err)
		b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
			messages.BuildStatusRejected,
			err.Error(),
		))
		return
	}

	// Get repo config
	repo, err := b.repoCfg.GetRepo(req.Repo)
	if err != nil {
		b.logger.Warn("unknown repo", "repo", req.Repo)
		b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
			messages.BuildStatusRejected,
			fmt.Sprintf("unknown repo: %s", req.Repo),
		))
		return
	}

	// Try to acquire lock: at most one build at a time; concurrent
	// requests are rejected rather than queued.
	b.lock.Lock()
	if b.busy {
		b.lock.Unlock()
		b.logger.Warn("build already in progress")
		b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
			messages.BuildStatusRejected,
			"another build is already in progress",
		))
		return
	}
	b.busy = true
	b.lock.Unlock()

	defer func() {
		b.lock.Lock()
		b.busy = false
		b.lock.Unlock()
	}()

	// Use default branch if not specified
	branch := req.Branch
	if branch == "" {
		branch = repo.DefaultBranch
	}

	// Determine hosts to build
	var hosts []string
	if req.Target == "all" {
		// List hosts from flake
		b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
			messages.BuildStatusStarted,
			"discovering hosts...",
		))

		// NOTE(review): context.Background() means discovery and builds
		// are not cancelled when the builder shuts down — confirm intended.
		hosts, err = b.executor.ListHosts(context.Background(), repo.URL, branch)
		if err != nil {
			b.logger.Error("failed to list hosts", "error", err)
			b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
				messages.BuildStatusFailed,
				fmt.Sprintf("failed to list hosts: %v", err),
			).WithError(err.Error()))
			if b.metrics != nil {
				b.metrics.RecordBuildFailure(req.Repo, "")
			}
			return
		}
		// Filter out hostnames with invalid characters (security: prevent injection)
		validHosts := make([]string, 0, len(hosts))
		for _, host := range hosts {
			if hostnameRegex.MatchString(host) {
				validHosts = append(validHosts, host)
			} else {
				b.logger.Warn("skipping hostname with invalid characters", "hostname", host)
			}
		}
		hosts = validHosts
		// Sort hosts for consistent ordering
		sort.Strings(hosts)
	} else {
		hosts = []string{req.Target}
	}

	if len(hosts) == 0 {
		b.sendResponse(req.ReplyTo, messages.NewBuildResponse(
			messages.BuildStatusFailed,
			"no hosts to build",
		))
		return
	}

	// Send started response
	b.sendResponse(req.ReplyTo, &messages.BuildResponse{
		Status:     messages.BuildStatusStarted,
		Message:    fmt.Sprintf("building %d host(s)", len(hosts)),
		HostsTotal: len(hosts),
	})

	// Build each host sequentially
	startTime := time.Now()
	results := make([]messages.BuildHostResult, 0, len(hosts))
	succeeded := 0
	failed := 0

	for i, host := range hosts {
		hostStart := time.Now()
		b.logger.Info("building host",
			"host", host,
			"repo", req.Repo,
			"rev", branch,
			"progress", fmt.Sprintf("%d/%d", i+1, len(hosts)),
			"command", b.executor.BuildCommand(repo.URL, branch, host),
		)

		result := b.executor.Build(context.Background(), repo.URL, branch, host)
		hostDuration := time.Since(hostStart).Seconds()

		hostResult := messages.BuildHostResult{
			Host:            host,
			Success:         result.Success,
			DurationSeconds: hostDuration,
		}
		if !result.Success {
			if result.Error != nil {
				hostResult.Error = result.Error.Error()
			}
			// Attach truncated stderr so the requester sees the failure
			// without unbounded message sizes.
			if result.Stderr != "" {
				hostResult.Output = truncateOutput(result.Stderr, 50)
			}
		}
		results = append(results, hostResult)

		if result.Success {
			succeeded++
			b.logger.Info("host build succeeded", "host", host, "repo", req.Repo, "rev", branch, "duration", hostDuration)
			if b.metrics != nil {
				b.metrics.RecordHostBuildSuccess(req.Repo, host, hostDuration)
			}
		} else {
			failed++
			b.logger.Error("host build failed", "host", host, "repo", req.Repo, "rev", branch, "error", hostResult.Error)
			if result.Stderr != "" {
				for _, line := range truncateOutputLines(result.Stderr, 50) {
					b.logger.Warn("build output", "host", host, "repo", req.Repo, "line", line)
				}
			}
			if b.metrics != nil {
				b.metrics.RecordHostBuildFailure(req.Repo, host, hostDuration)
			}
		}

		// Send progress update. Take a copy of the success flag so the
		// pointer does not alias a loop-scoped variable that changes.
		success := result.Success
		b.sendResponse(req.ReplyTo, &messages.BuildResponse{
			Status:         messages.BuildStatusProgress,
			Host:           host,
			HostSuccess:    &success,
			HostsCompleted: i + 1,
			HostsTotal:     len(hosts),
		})
	}

	totalDuration := time.Since(startTime).Seconds()

	// Send final response
	status := messages.BuildStatusCompleted
	message := fmt.Sprintf("built %d/%d hosts successfully", succeeded, len(hosts))
	if failed > 0 {
		status = messages.BuildStatusFailed
		message = fmt.Sprintf("build failed: %d/%d hosts failed", failed, len(hosts))
	}

	b.sendResponse(req.ReplyTo, &messages.BuildResponse{
		Status:               status,
		Message:              message,
		Results:              results,
		TotalDurationSeconds: totalDuration,
		Succeeded:            succeeded,
		Failed:               failed,
	})

	// Record overall build metrics
	if b.metrics != nil {
		if failed == 0 {
			b.metrics.RecordBuildSuccess(req.Repo)
		} else {
			b.metrics.RecordBuildFailure(req.Repo, "")
		}
	}
}
|
||||
|
||||
func (b *Builder) sendResponse(replyTo string, resp *messages.BuildResponse) {
|
||||
data, err := resp.Marshal()
|
||||
if err != nil {
|
||||
b.logger.Error("failed to marshal build response", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
if err := b.client.Publish(replyTo, data); err != nil {
|
||||
b.logger.Error("failed to publish build response",
|
||||
"reply_to", replyTo,
|
||||
"error", err,
|
||||
)
|
||||
}
|
||||
|
||||
// Flush to ensure response is sent immediately
|
||||
if err := b.client.Flush(); err != nil {
|
||||
b.logger.Error("failed to flush", "error", err)
|
||||
}
|
||||
}
|
||||
164
internal/builder/builder_test.go
Normal file
164
internal/builder/builder_test.go
Normal file
@@ -0,0 +1,164 @@
|
||||
package builder
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestTruncateOutput table-tests truncateOutput: outputs at or below
// 2*keepLines lines pass through, larger outputs collapse to
// head + blank-padded marker + tail, and the first/last input lines
// survive truncation.
func TestTruncateOutput(t *testing.T) {
	tests := []struct {
		name      string
		input     string
		keepLines int
		wantLines int // expected line count after splitting the result on "\n"
		wantOmit  bool
	}{
		{
			name:      "short output unchanged",
			input:     "line1\nline2\nline3",
			keepLines: 50,
			wantLines: 3,
			wantOmit:  false,
		},
		{
			name:      "exactly at threshold unchanged",
			input:     strings.Join(makeLines(100), "\n"),
			keepLines: 50,
			wantLines: 100,
			wantOmit:  false,
		},
		{
			name:      "over threshold truncated",
			input:     strings.Join(makeLines(150), "\n"),
			keepLines: 50,
			wantLines: 103, // 50 + 1 (empty) + 1 (omitted msg) + 1 (empty) + 50
			wantOmit:  true,
		},
		{
			name:      "large output truncated",
			input:     strings.Join(makeLines(1000), "\n"),
			keepLines: 50,
			wantLines: 103,
			wantOmit:  true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := truncateOutput(tt.input, tt.keepLines)
			gotLines := strings.Split(got, "\n")

			if len(gotLines) != tt.wantLines {
				t.Errorf("got %d lines, want %d", len(gotLines), tt.wantLines)
			}

			hasOmit := strings.Contains(got, "lines omitted")
			if hasOmit != tt.wantOmit {
				t.Errorf("got omit marker = %v, want %v", hasOmit, tt.wantOmit)
			}

			if tt.wantOmit {
				// Verify first and last lines are preserved
				inputLines := strings.Split(tt.input, "\n")
				firstLine := inputLines[0]
				lastLine := inputLines[len(inputLines)-1]
				if !strings.HasPrefix(got, firstLine+"\n") {
					t.Errorf("first line not preserved, got prefix %q, want %q",
						gotLines[0], firstLine)
				}
				if !strings.HasSuffix(got, lastLine) {
					t.Errorf("last line not preserved, got suffix %q, want %q",
						gotLines[len(gotLines)-1], lastLine)
				}
			}
		})
	}
}
|
||||
|
||||
// makeLines builds n test lines whose lengths cycle with their index
// (0–79 trailing 'x' characters), so neighbouring lines are
// distinguishable in truncation checks.
func makeLines(n int) []string {
	out := make([]string, n)
	for i := 0; i < n; i++ {
		out[i] = "line " + strings.Repeat("x", i%80)
	}
	return out
}
|
||||
|
||||
// TestTruncateOutputLines covers the slice-returning truncation helper:
// pass-through below the threshold, head + single marker line + tail
// above it, and the exact boundary case.
func TestTruncateOutputLines(t *testing.T) {
	t.Run("short output returns all lines", func(t *testing.T) {
		input := "line1\nline2\nline3"
		got := truncateOutputLines(input, 50)
		if len(got) != 3 {
			t.Errorf("got %d lines, want 3", len(got))
		}
		if got[0] != "line1" || got[1] != "line2" || got[2] != "line3" {
			t.Errorf("unexpected lines: %v", got)
		}
	})

	t.Run("over threshold returns head + marker + tail", func(t *testing.T) {
		lines := makeLines(200)
		input := strings.Join(lines, "\n")
		got := truncateOutputLines(input, 50)

		// Should be 50 head + 1 marker + 50 tail = 101
		if len(got) != 101 {
			t.Errorf("got %d lines, want 101", len(got))
		}

		// Check first and last lines preserved
		if got[0] != lines[0] {
			t.Errorf("first line = %q, want %q", got[0], lines[0])
		}
		if got[len(got)-1] != lines[len(lines)-1] {
			t.Errorf("last line = %q, want %q", got[len(got)-1], lines[len(lines)-1])
		}

		// Check omitted marker sits right after the head lines.
		marker := got[50]
		expected := fmt.Sprintf("... (%d lines omitted) ...", 100)
		if marker != expected {
			t.Errorf("marker = %q, want %q", marker, expected)
		}
	})

	t.Run("exactly at threshold returns all lines", func(t *testing.T) {
		lines := makeLines(100)
		input := strings.Join(lines, "\n")
		got := truncateOutputLines(input, 50)
		if len(got) != 100 {
			t.Errorf("got %d lines, want 100", len(got))
		}
	})
}
|
||||
|
||||
// TestTruncateOutputPreservesContent plants sentinel lines at the edges
// of the kept head (indices 0 and 49) and tail (150 and 199) of a
// 200-line input, then checks all four survive truncation with
// keepLines=50 and that the omitted count is exactly 100.
func TestTruncateOutputPreservesContent(t *testing.T) {
	// Create input with distinct first and last lines
	lines := make([]string, 200)
	for i := range lines {
		lines[i] = "middle"
	}
	lines[0] = "FIRST"
	lines[49] = "LAST_OF_HEAD"
	lines[150] = "FIRST_OF_TAIL"
	lines[199] = "LAST"

	input := strings.Join(lines, "\n")
	got := truncateOutput(input, 50)

	if !strings.Contains(got, "FIRST") {
		t.Error("missing FIRST")
	}
	if !strings.Contains(got, "LAST_OF_HEAD") {
		t.Error("missing LAST_OF_HEAD")
	}
	if !strings.Contains(got, "FIRST_OF_TAIL") {
		t.Error("missing FIRST_OF_TAIL")
	}
	if !strings.Contains(got, "LAST") {
		t.Error("missing LAST")
	}
	if !strings.Contains(got, "(100 lines omitted)") {
		t.Errorf("wrong omitted count, got: %s", got)
	}
}
|
||||
96
internal/builder/config.go
Normal file
96
internal/builder/config.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package builder
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// repoNameRegex validates repository names for safe use in NATS subjects.
// Only allows alphanumeric, dashes, and underscores (no dots or wildcards),
// since repo names are interpolated into "build.<repo>.*" subscriptions
// where "." is a token separator and "*"/">" are wildcards.
var repoNameRegex = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
|
||||
|
||||
// validURLPrefixes are the allowed prefixes for repository URLs.
// Restricting URLs to explicit git+ flake schemes keeps arbitrary
// fetcher schemes out of the nix command line built by the executor.
var validURLPrefixes = []string{
	"git+https://",
	"git+ssh://",
	"git+file://",
}
|
||||
|
||||
// RepoConfig holds configuration for a single repository.
type RepoConfig struct {
	// URL is the flake URL; Validate requires a git+https://, git+ssh://
	// or git+file:// prefix.
	URL string `yaml:"url"`
	// DefaultBranch is used when a build request does not name a branch.
	DefaultBranch string `yaml:"default_branch"`
}
|
||||
|
||||
// Config holds the builder configuration.
type Config struct {
	// Repos maps repository name (used as a NATS subject token) to its
	// repository settings. Must be non-empty to pass Validate.
	Repos map[string]RepoConfig `yaml:"repos"`
}
|
||||
|
||||
// LoadConfig loads configuration from a YAML file.
|
||||
func LoadConfig(path string) (*Config, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read config file: %w", err)
|
||||
}
|
||||
|
||||
var cfg Config
|
||||
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse config file: %w", err)
|
||||
}
|
||||
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
// Validate checks that the configuration is valid.
|
||||
func (c *Config) Validate() error {
|
||||
if len(c.Repos) == 0 {
|
||||
return fmt.Errorf("no repos configured")
|
||||
}
|
||||
|
||||
for name, repo := range c.Repos {
|
||||
// Validate repo name for safe use in NATS subjects
|
||||
if !repoNameRegex.MatchString(name) {
|
||||
return fmt.Errorf("repo name %q contains invalid characters (only alphanumeric, dash, underscore allowed)", name)
|
||||
}
|
||||
|
||||
if repo.URL == "" {
|
||||
return fmt.Errorf("repo %q: url is required", name)
|
||||
}
|
||||
|
||||
// Validate URL format
|
||||
validURL := false
|
||||
for _, prefix := range validURLPrefixes {
|
||||
if strings.HasPrefix(repo.URL, prefix) {
|
||||
validURL = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !validURL {
|
||||
return fmt.Errorf("repo %q: url must start with git+https://, git+ssh://, or git+file://", name)
|
||||
}
|
||||
|
||||
if repo.DefaultBranch == "" {
|
||||
return fmt.Errorf("repo %q: default_branch is required", name)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetRepo returns the configuration for a repository, or an error if not found.
|
||||
func (c *Config) GetRepo(name string) (*RepoConfig, error) {
|
||||
repo, ok := c.Repos[name]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("repo %q not found in configuration", name)
|
||||
}
|
||||
return &repo, nil
|
||||
}
|
||||
116
internal/builder/executor.go
Normal file
116
internal/builder/executor.go
Normal file
@@ -0,0 +1,116 @@
|
||||
package builder
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Executor handles the execution of nix build commands.
type Executor struct {
	// timeout bounds each Build invocation; ListHosts uses its own
	// fixed 60-second limit instead.
	timeout time.Duration
}
|
||||
|
||||
// NewExecutor creates a new build executor.
|
||||
func NewExecutor(timeout time.Duration) *Executor {
|
||||
return &Executor{
|
||||
timeout: timeout,
|
||||
}
|
||||
}
|
||||
|
||||
// BuildResult contains the result of a build execution.
type BuildResult struct {
	Success  bool   // true when nix build exited cleanly
	ExitCode int    // process exit code; 0 on success, -1 when no exit code was available
	Stdout   string // captured standard output
	Stderr   string // captured standard error (nix writes build logs here)
	Error    error  // run error; replaced with a timeout error when the deadline was hit
}
|
||||
|
||||
// FlakeShowResult contains the parsed output of nix flake show.
// Only the nixosConfigurations attribute set is decoded; the values are
// left opaque because only the host names (map keys) are needed.
type FlakeShowResult struct {
	NixosConfigurations map[string]any `json:"nixosConfigurations"`
}
|
||||
|
||||
// ListHosts returns the list of hosts (nixosConfigurations) available in a flake.
|
||||
func (e *Executor) ListHosts(ctx context.Context, flakeURL, branch string) ([]string, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
flakeRef := fmt.Sprintf("%s?ref=%s", flakeURL, branch)
|
||||
cmd := exec.CommandContext(ctx, "nix", "flake", "show", "--json", flakeRef)
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
return nil, fmt.Errorf("timeout listing hosts")
|
||||
}
|
||||
return nil, fmt.Errorf("failed to list hosts: %w\n%s", err, stderr.String())
|
||||
}
|
||||
|
||||
var result FlakeShowResult
|
||||
if err := json.Unmarshal(stdout.Bytes(), &result); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse flake show output: %w", err)
|
||||
}
|
||||
|
||||
hosts := make([]string, 0, len(result.NixosConfigurations))
|
||||
for host := range result.NixosConfigurations {
|
||||
hosts = append(hosts, host)
|
||||
}
|
||||
|
||||
return hosts, nil
|
||||
}
|
||||
|
||||
// Build builds a single host's system configuration.
|
||||
func (e *Executor) Build(ctx context.Context, flakeURL, branch, host string) *BuildResult {
|
||||
ctx, cancel := context.WithTimeout(ctx, e.timeout)
|
||||
defer cancel()
|
||||
|
||||
// Build the flake reference for the system toplevel
|
||||
flakeRef := fmt.Sprintf("%s?ref=%s#nixosConfigurations.%s.config.system.build.toplevel", flakeURL, branch, host)
|
||||
|
||||
cmd := exec.CommandContext(ctx, "nix", "build", "--no-link", flakeRef)
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
err := cmd.Run()
|
||||
|
||||
result := &BuildResult{
|
||||
Stdout: stdout.String(),
|
||||
Stderr: stderr.String(),
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
result.Success = false
|
||||
result.Error = err
|
||||
|
||||
if ctx.Err() == context.DeadlineExceeded {
|
||||
result.Error = fmt.Errorf("build timed out after %v", e.timeout)
|
||||
}
|
||||
|
||||
if exitErr, ok := err.(*exec.ExitError); ok {
|
||||
result.ExitCode = exitErr.ExitCode()
|
||||
} else {
|
||||
result.ExitCode = -1
|
||||
}
|
||||
} else {
|
||||
result.Success = true
|
||||
result.ExitCode = 0
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// BuildCommand returns the command that would be executed (for logging/debugging).
|
||||
func (e *Executor) BuildCommand(flakeURL, branch, host string) string {
|
||||
flakeRef := fmt.Sprintf("%s?ref=%s#nixosConfigurations.%s.config.system.build.toplevel", flakeURL, branch, host)
|
||||
return fmt.Sprintf("nix build --no-link %s", flakeRef)
|
||||
}
|
||||
140
internal/cli/build.go
Normal file
140
internal/cli/build.go
Normal file
@@ -0,0 +1,140 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/nats"
|
||||
)
|
||||
|
||||
// BuildConfig holds configuration for a build operation.
type BuildConfig struct {
	NATSUrl  string // NATS server URL, passed to nats.Connect
	NKeyFile string // path to an NKey file for NATS auth
	Repo     string // repository name to build
	Target   string // hostname to build; builder treats "all" as every host in the flake
	Branch   string // branch/ref to build; empty lets the builder use the repo default
	// Timeout — presumably bounds the wait for build responses; the
	// consuming code is outside this view, confirm in Build. TODO confirm
	Timeout time.Duration
}
|
||||
|
||||
// BuildResult contains the aggregated results from a build.
type BuildResult struct {
	Responses     []*messages.BuildResponse // every response received on the reply subject, in arrival order
	FinalResponse *messages.BuildResponse   // the response whose status IsFinal(), if one arrived
	Errors        []error                   // unmarshal/transport errors encountered while collecting
}
|
||||
|
||||
// AllSucceeded returns true if the build completed successfully.
|
||||
func (r *BuildResult) AllSucceeded() bool {
|
||||
if len(r.Errors) > 0 {
|
||||
return false
|
||||
}
|
||||
if r.FinalResponse == nil {
|
||||
return false
|
||||
}
|
||||
return r.FinalResponse.Status == messages.BuildStatusCompleted && r.FinalResponse.Failed == 0
|
||||
}
|
||||
|
||||
// MarshalJSON returns the JSON representation of the build result.
|
||||
func (r *BuildResult) MarshalJSON() ([]byte, error) {
|
||||
if r.FinalResponse != nil {
|
||||
return json.Marshal(r.FinalResponse)
|
||||
}
|
||||
return json.Marshal(map[string]any{
|
||||
"status": "unknown",
|
||||
"responses": r.Responses,
|
||||
"errors": r.Errors,
|
||||
})
|
||||
}
|
||||
|
||||
// Build triggers a build and collects responses.
|
||||
func Build(ctx context.Context, cfg BuildConfig, onResponse func(*messages.BuildResponse)) (*BuildResult, error) {
|
||||
// Connect to NATS
|
||||
client, err := nats.Connect(nats.Config{
|
||||
URL: cfg.NATSUrl,
|
||||
NKeyFile: cfg.NKeyFile,
|
||||
Name: "homelab-deploy-build-cli",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to connect to NATS: %w", err)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
// Generate unique reply subject
|
||||
requestID := uuid.New().String()
|
||||
replySubject := fmt.Sprintf("build.responses.%s", requestID)
|
||||
|
||||
var mu sync.Mutex
|
||||
result := &BuildResult{}
|
||||
done := make(chan struct{})
|
||||
|
||||
// Subscribe to reply subject
|
||||
sub, err := client.Subscribe(replySubject, func(subject string, data []byte) {
|
||||
resp, err := messages.UnmarshalBuildResponse(data)
|
||||
if err != nil {
|
||||
mu.Lock()
|
||||
result.Errors = append(result.Errors, fmt.Errorf("failed to unmarshal response: %w", err))
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
result.Responses = append(result.Responses, resp)
|
||||
if resp.Status.IsFinal() {
|
||||
result.FinalResponse = resp
|
||||
select {
|
||||
case <-done:
|
||||
default:
|
||||
close(done)
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
|
||||
if onResponse != nil {
|
||||
onResponse(resp)
|
||||
}
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to subscribe to reply subject: %w", err)
|
||||
}
|
||||
defer func() { _ = sub.Unsubscribe() }()
|
||||
|
||||
// Build and send request
|
||||
req := &messages.BuildRequest{
|
||||
Repo: cfg.Repo,
|
||||
Target: cfg.Target,
|
||||
Branch: cfg.Branch,
|
||||
ReplyTo: replySubject,
|
||||
}
|
||||
|
||||
data, err := req.Marshal()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
// Publish to build.<repo>.<target>
|
||||
buildSubject := fmt.Sprintf("build.%s.%s", cfg.Repo, cfg.Target)
|
||||
if err := client.Publish(buildSubject, data); err != nil {
|
||||
return nil, fmt.Errorf("failed to publish request: %w", err)
|
||||
}
|
||||
|
||||
if err := client.Flush(); err != nil {
|
||||
return nil, fmt.Errorf("failed to flush: %w", err)
|
||||
}
|
||||
|
||||
// Wait for final response or timeout
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return result, ctx.Err()
|
||||
case <-done:
|
||||
return result, nil
|
||||
case <-time.After(cfg.Timeout):
|
||||
return result, nil
|
||||
}
|
||||
}
|
||||
@@ -8,8 +8,8 @@ import (
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/nats"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/nats"
|
||||
)
|
||||
|
||||
// DeployConfig holds configuration for a deploy operation.
|
||||
@@ -28,14 +28,32 @@ type DeployResult struct {
|
||||
Errors []error
|
||||
}
|
||||
|
||||
// AllSucceeded returns true if all responses indicate success.
|
||||
// AllSucceeded returns true if all hosts' final responses indicate success.
|
||||
func (r *DeployResult) AllSucceeded() bool {
|
||||
if len(r.Errors) > 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// Track the final status for each host
|
||||
finalStatus := make(map[string]messages.Status)
|
||||
for _, resp := range r.Responses {
|
||||
if resp.Status != messages.StatusCompleted {
|
||||
if resp.Status.IsFinal() {
|
||||
finalStatus[resp.Hostname] = resp.Status
|
||||
}
|
||||
}
|
||||
|
||||
// Need at least one host with a final status
|
||||
if len(finalStatus) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// All final statuses must be completed
|
||||
for _, status := range finalStatus {
|
||||
if status != messages.StatusCompleted {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return len(r.Responses) > 0 && len(r.Errors) == 0
|
||||
return true
|
||||
}
|
||||
|
||||
// HostCount returns the number of unique hosts that responded.
|
||||
|
||||
@@ -3,7 +3,7 @@ package cli
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
)
|
||||
|
||||
func TestDeployResult_AllSucceeded(t *testing.T) {
|
||||
@@ -49,6 +49,40 @@ func TestDeployResult_AllSucceeded(t *testing.T) {
|
||||
errors: []error{nil}, // placeholder error
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "with intermediate responses - success",
|
||||
responses: []*messages.DeployResponse{
|
||||
{Hostname: "host1", Status: messages.StatusStarted},
|
||||
{Hostname: "host1", Status: messages.StatusCompleted},
|
||||
},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "with intermediate responses - failure",
|
||||
responses: []*messages.DeployResponse{
|
||||
{Hostname: "host1", Status: messages.StatusStarted},
|
||||
{Hostname: "host1", Status: messages.StatusFailed},
|
||||
},
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "multiple hosts with intermediate responses",
|
||||
responses: []*messages.DeployResponse{
|
||||
{Hostname: "host1", Status: messages.StatusStarted},
|
||||
{Hostname: "host2", Status: messages.StatusStarted},
|
||||
{Hostname: "host1", Status: messages.StatusCompleted},
|
||||
{Hostname: "host2", Status: messages.StatusCompleted},
|
||||
},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "only intermediate responses - no final",
|
||||
responses: []*messages.DeployResponse{
|
||||
{Hostname: "host1", Status: messages.StatusStarted},
|
||||
{Hostname: "host1", Status: messages.StatusAccepted},
|
||||
},
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
)
|
||||
|
||||
// Executor handles the execution of nixos-rebuild commands.
|
||||
@@ -35,6 +35,15 @@ type Result struct {
|
||||
Error error
|
||||
}
|
||||
|
||||
// ExecuteOptions contains optional settings for Execute.
|
||||
type ExecuteOptions struct {
|
||||
// HeartbeatInterval is how often to call the heartbeat callback.
|
||||
// If zero, no heartbeat is sent.
|
||||
HeartbeatInterval time.Duration
|
||||
// HeartbeatCallback is called periodically with elapsed time while the command runs.
|
||||
HeartbeatCallback func(elapsed time.Duration)
|
||||
}
|
||||
|
||||
// ValidateRevision checks if a revision exists in the remote repository.
|
||||
// It uses git ls-remote to verify the ref exists.
|
||||
func (e *Executor) ValidateRevision(ctx context.Context, revision string) error {
|
||||
@@ -65,6 +74,11 @@ func (e *Executor) ValidateRevision(ctx context.Context, revision string) error
|
||||
|
||||
// Execute runs nixos-rebuild with the specified action and revision.
|
||||
func (e *Executor) Execute(ctx context.Context, action messages.Action, revision string) *Result {
|
||||
return e.ExecuteWithOptions(ctx, action, revision, nil)
|
||||
}
|
||||
|
||||
// ExecuteWithOptions runs nixos-rebuild with the specified action, revision, and options.
|
||||
func (e *Executor) ExecuteWithOptions(ctx context.Context, action messages.Action, revision string, opts *ExecuteOptions) *Result {
|
||||
ctx, cancel := context.WithTimeout(ctx, e.timeout)
|
||||
defer cancel()
|
||||
|
||||
@@ -77,7 +91,41 @@ func (e *Executor) Execute(ctx context.Context, action messages.Action, revision
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
err := cmd.Run()
|
||||
// Start the command
|
||||
startTime := time.Now()
|
||||
if err := cmd.Start(); err != nil {
|
||||
return &Result{
|
||||
Success: false,
|
||||
ExitCode: -1,
|
||||
Error: fmt.Errorf("failed to start command: %w", err),
|
||||
}
|
||||
}
|
||||
|
||||
// Set up heartbeat if configured
|
||||
var heartbeatDone chan struct{}
|
||||
if opts != nil && opts.HeartbeatInterval > 0 && opts.HeartbeatCallback != nil {
|
||||
heartbeatDone = make(chan struct{})
|
||||
go func() {
|
||||
ticker := time.NewTicker(opts.HeartbeatInterval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-heartbeatDone:
|
||||
return
|
||||
case <-ticker.C:
|
||||
opts.HeartbeatCallback(time.Since(startTime))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Wait for command to complete
|
||||
err := cmd.Wait()
|
||||
|
||||
// Stop heartbeat goroutine
|
||||
if heartbeatDone != nil {
|
||||
close(heartbeatDone)
|
||||
}
|
||||
|
||||
result := &Result{
|
||||
Stdout: stdout.String(),
|
||||
|
||||
@@ -4,7 +4,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
)
|
||||
|
||||
func TestExecutor_BuildCommand(t *testing.T) {
|
||||
|
||||
@@ -6,22 +6,27 @@ import (
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/deploy"
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/nats"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/deploy"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/metrics"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/nats"
|
||||
)
|
||||
|
||||
// Config holds the configuration for the listener.
|
||||
type Config struct {
|
||||
Hostname string
|
||||
Tier string
|
||||
Role string
|
||||
NATSUrl string
|
||||
NKeyFile string
|
||||
FlakeURL string
|
||||
Timeout time.Duration
|
||||
DeploySubjects []string
|
||||
DiscoverSubject string
|
||||
Hostname string
|
||||
Tier string
|
||||
Role string
|
||||
NATSUrl string
|
||||
NKeyFile string
|
||||
FlakeURL string
|
||||
Timeout time.Duration
|
||||
HeartbeatInterval time.Duration
|
||||
DeploySubjects []string
|
||||
DiscoverSubject string
|
||||
MetricsEnabled bool
|
||||
MetricsAddr string
|
||||
Version string
|
||||
}
|
||||
|
||||
// Listener handles deployment requests from NATS.
|
||||
@@ -34,6 +39,14 @@ type Listener struct {
|
||||
|
||||
// Expanded subjects for discovery responses
|
||||
expandedSubjects []string
|
||||
|
||||
// restartCh signals that the listener should exit for restart
|
||||
// (e.g., after a successful switch deployment)
|
||||
restartCh chan struct{}
|
||||
|
||||
// metrics server and collector (nil if metrics disabled)
|
||||
metricsServer *metrics.Server
|
||||
metrics *metrics.Collector
|
||||
}
|
||||
|
||||
// New creates a new listener with the given configuration.
|
||||
@@ -42,16 +55,42 @@ func New(cfg Config, logger *slog.Logger) *Listener {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
return &Listener{
|
||||
cfg: cfg,
|
||||
executor: deploy.NewExecutor(cfg.FlakeURL, cfg.Hostname, cfg.Timeout),
|
||||
lock: deploy.NewLock(),
|
||||
logger: logger,
|
||||
l := &Listener{
|
||||
cfg: cfg,
|
||||
executor: deploy.NewExecutor(cfg.FlakeURL, cfg.Hostname, cfg.Timeout),
|
||||
lock: deploy.NewLock(),
|
||||
logger: logger,
|
||||
restartCh: make(chan struct{}, 1),
|
||||
}
|
||||
|
||||
if cfg.MetricsEnabled {
|
||||
l.metricsServer = metrics.NewServer(metrics.ServerConfig{
|
||||
Addr: cfg.MetricsAddr,
|
||||
Logger: logger,
|
||||
})
|
||||
l.metrics = l.metricsServer.Collector()
|
||||
}
|
||||
|
||||
return l
|
||||
}
|
||||
|
||||
// Run starts the listener and blocks until the context is cancelled.
|
||||
func (l *Listener) Run(ctx context.Context) error {
|
||||
// Start metrics server if enabled
|
||||
if l.metricsServer != nil {
|
||||
if err := l.metricsServer.Start(); err != nil {
|
||||
return fmt.Errorf("failed to start metrics server: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
_ = l.metricsServer.Shutdown(shutdownCtx)
|
||||
}()
|
||||
|
||||
// Set instance info metric
|
||||
l.metrics.SetInfo(l.cfg.Hostname, l.cfg.Tier, l.cfg.Role, l.cfg.Version)
|
||||
}
|
||||
|
||||
// Connect to NATS
|
||||
l.logger.Info("connecting to NATS",
|
||||
"url", l.cfg.NATSUrl,
|
||||
@@ -93,9 +132,13 @@ func (l *Listener) Run(ctx context.Context) error {
|
||||
|
||||
l.logger.Info("listener started", "deploy_subjects", l.expandedSubjects, "discover_subject", discoverSubject)
|
||||
|
||||
// Wait for context cancellation
|
||||
<-ctx.Done()
|
||||
l.logger.Info("shutting down listener")
|
||||
// Wait for context cancellation or restart signal
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
l.logger.Info("shutting down listener")
|
||||
case <-l.restartCh:
|
||||
l.logger.Info("exiting for restart after successful switch deployment")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -127,6 +170,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
||||
messages.StatusRejected,
|
||||
err.Error(),
|
||||
).WithError(messages.ErrorInvalidAction))
|
||||
if l.metrics != nil {
|
||||
l.metrics.RecordRejection(req.Action, messages.ErrorInvalidAction)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -141,6 +187,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
||||
messages.StatusRejected,
|
||||
"another deployment is already in progress",
|
||||
).WithError(messages.ErrorAlreadyRunning))
|
||||
if l.metrics != nil {
|
||||
l.metrics.RecordRejection(req.Action, messages.ErrorAlreadyRunning)
|
||||
}
|
||||
return
|
||||
}
|
||||
defer l.lock.Release()
|
||||
@@ -152,6 +201,12 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
||||
fmt.Sprintf("starting deployment: %s", l.executor.BuildCommand(req.Action, req.Revision)),
|
||||
))
|
||||
|
||||
// Record deployment start for metrics
|
||||
if l.metrics != nil {
|
||||
l.metrics.RecordDeploymentStart()
|
||||
}
|
||||
startTime := time.Now()
|
||||
|
||||
// Validate revision
|
||||
ctx := context.Background()
|
||||
if err := l.executor.ValidateRevision(ctx, req.Revision); err != nil {
|
||||
@@ -164,6 +219,10 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
||||
messages.StatusFailed,
|
||||
fmt.Sprintf("revision validation failed: %v", err),
|
||||
).WithError(messages.ErrorInvalidRevision))
|
||||
if l.metrics != nil {
|
||||
duration := time.Since(startTime).Seconds()
|
||||
l.metrics.RecordDeploymentFailure(req.Action, messages.ErrorInvalidRevision, duration)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -174,7 +233,23 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
||||
"command", l.executor.BuildCommand(req.Action, req.Revision),
|
||||
)
|
||||
|
||||
result := l.executor.Execute(ctx, req.Action, req.Revision)
|
||||
// Set up heartbeat options to send periodic status updates
|
||||
var opts *deploy.ExecuteOptions
|
||||
if l.cfg.HeartbeatInterval > 0 {
|
||||
opts = &deploy.ExecuteOptions{
|
||||
HeartbeatInterval: l.cfg.HeartbeatInterval,
|
||||
HeartbeatCallback: func(elapsed time.Duration) {
|
||||
l.sendResponse(req.ReplyTo, messages.NewDeployResponse(
|
||||
l.cfg.Hostname,
|
||||
messages.StatusRunning,
|
||||
fmt.Sprintf("deployment in progress (%s elapsed)", elapsed.Round(time.Second)),
|
||||
))
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
result := l.executor.ExecuteWithOptions(ctx, req.Action, req.Revision, opts)
|
||||
duration := time.Since(startTime).Seconds()
|
||||
|
||||
if result.Success {
|
||||
l.logger.Info("deployment completed successfully",
|
||||
@@ -185,6 +260,33 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
||||
messages.StatusCompleted,
|
||||
"deployment completed successfully",
|
||||
))
|
||||
// Flush to ensure the completed response is sent before we potentially restart
|
||||
if err := l.client.Flush(); err != nil {
|
||||
l.logger.Error("failed to flush completed response", "error", err)
|
||||
}
|
||||
if l.metrics != nil {
|
||||
l.metrics.RecordDeploymentEnd(req.Action, true, duration)
|
||||
}
|
||||
|
||||
// After a successful switch, signal restart so we pick up any new version
|
||||
if req.Action == messages.ActionSwitch {
|
||||
// Wait for metrics scrape before restarting (if metrics enabled)
|
||||
if l.metricsServer != nil {
|
||||
l.logger.Info("waiting for metrics scrape before restart")
|
||||
select {
|
||||
case <-l.metricsServer.ScrapeCh():
|
||||
l.logger.Info("metrics scraped, proceeding with restart")
|
||||
case <-time.After(60 * time.Second):
|
||||
l.logger.Warn("no metrics scrape within timeout, proceeding with restart anyway")
|
||||
}
|
||||
}
|
||||
|
||||
select {
|
||||
case l.restartCh <- struct{}{}:
|
||||
default:
|
||||
// Channel already has a signal pending
|
||||
}
|
||||
}
|
||||
} else {
|
||||
l.logger.Error("deployment failed",
|
||||
"exit_code", result.ExitCode,
|
||||
@@ -202,6 +304,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
||||
messages.StatusFailed,
|
||||
fmt.Sprintf("deployment failed (exit code %d): %s", result.ExitCode, result.Stderr),
|
||||
).WithError(errorCode))
|
||||
if l.metrics != nil {
|
||||
l.metrics.RecordDeploymentFailure(req.Action, errorCode, duration)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
109
internal/mcp/build_tools.go
Normal file
109
internal/mcp/build_tools.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package mcp
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mark3labs/mcp-go/mcp"
|
||||
|
||||
deploycli "code.t-juice.club/torjus/homelab-deploy/internal/cli"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
)
|
||||
|
||||
// BuildTool creates the build tool definition.
|
||||
func BuildTool() mcp.Tool {
|
||||
return mcp.NewTool(
|
||||
"build",
|
||||
mcp.WithDescription("Trigger a Nix build on the build server"),
|
||||
mcp.WithString("repo",
|
||||
mcp.Required(),
|
||||
mcp.Description("Repository name (must match builder config)"),
|
||||
),
|
||||
mcp.WithString("target",
|
||||
mcp.Description("Target hostname, or omit to build all hosts"),
|
||||
),
|
||||
mcp.WithBoolean("all",
|
||||
mcp.Description("Build all hosts in the repository (default if no target specified)"),
|
||||
),
|
||||
mcp.WithString("branch",
|
||||
mcp.Description("Git branch to build (uses repo default if not specified)"),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
// HandleBuild handles the build tool.
|
||||
func (h *ToolHandler) HandleBuild(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
|
||||
repo, err := request.RequireString("repo")
|
||||
if err != nil {
|
||||
return mcp.NewToolResultError("repo is required"), nil
|
||||
}
|
||||
|
||||
target := request.GetString("target", "")
|
||||
all := request.GetBool("all", false)
|
||||
branch := request.GetString("branch", "")
|
||||
|
||||
// Default to "all" if no target specified
|
||||
if target == "" {
|
||||
if !all {
|
||||
all = true
|
||||
}
|
||||
target = "all"
|
||||
}
|
||||
if all && target != "all" {
|
||||
return mcp.NewToolResultError("cannot specify both target and all"), nil
|
||||
}
|
||||
|
||||
cfg := deploycli.BuildConfig{
|
||||
NATSUrl: h.cfg.NATSUrl,
|
||||
NKeyFile: h.cfg.NKeyFile,
|
||||
Repo: repo,
|
||||
Target: target,
|
||||
Branch: branch,
|
||||
Timeout: h.cfg.Timeout,
|
||||
}
|
||||
|
||||
var output strings.Builder
|
||||
branchStr := branch
|
||||
if branchStr == "" {
|
||||
branchStr = "(default)"
|
||||
}
|
||||
output.WriteString(fmt.Sprintf("Building %s target=%s branch=%s\n\n", repo, target, branchStr))
|
||||
|
||||
result, err := deploycli.Build(ctx, cfg, func(resp *messages.BuildResponse) {
|
||||
switch resp.Status {
|
||||
case messages.BuildStatusStarted:
|
||||
output.WriteString(fmt.Sprintf("Started: %s\n", resp.Message))
|
||||
case messages.BuildStatusProgress:
|
||||
successStr := "..."
|
||||
if resp.HostSuccess != nil {
|
||||
if *resp.HostSuccess {
|
||||
successStr = "success"
|
||||
} else {
|
||||
successStr = "failed"
|
||||
}
|
||||
}
|
||||
output.WriteString(fmt.Sprintf("[%d/%d] %s: %s\n", resp.HostsCompleted, resp.HostsTotal, resp.Host, successStr))
|
||||
case messages.BuildStatusCompleted, messages.BuildStatusFailed:
|
||||
output.WriteString(fmt.Sprintf("\n%s\n", resp.Message))
|
||||
case messages.BuildStatusRejected:
|
||||
output.WriteString(fmt.Sprintf("Rejected: %s\n", resp.Message))
|
||||
}
|
||||
})
|
||||
if err != nil {
|
||||
return mcp.NewToolResultError(fmt.Sprintf("build failed: %v", err)), nil
|
||||
}
|
||||
|
||||
if result.FinalResponse != nil {
|
||||
output.WriteString(fmt.Sprintf("\nBuild complete: %d succeeded, %d failed (%.1fs)\n",
|
||||
result.FinalResponse.Succeeded,
|
||||
result.FinalResponse.Failed,
|
||||
result.FinalResponse.TotalDurationSeconds))
|
||||
}
|
||||
|
||||
if !result.AllSucceeded() {
|
||||
output.WriteString("WARNING: Some builds failed\n")
|
||||
}
|
||||
|
||||
return mcp.NewToolResultText(output.String()), nil
|
||||
}
|
||||
@@ -12,6 +12,7 @@ type ServerConfig struct {
|
||||
NKeyFile string
|
||||
EnableAdmin bool
|
||||
AdminNKeyFile string
|
||||
EnableBuilds bool
|
||||
DiscoverSubject string
|
||||
Timeout time.Duration
|
||||
}
|
||||
@@ -49,6 +50,11 @@ func New(cfg ServerConfig) *Server {
|
||||
s.AddTool(DeployAdminTool(), handler.HandleDeployAdmin)
|
||||
}
|
||||
|
||||
// Optionally register build tool
|
||||
if cfg.EnableBuilds {
|
||||
s.AddTool(BuildTool(), handler.HandleBuild)
|
||||
}
|
||||
|
||||
return &Server{
|
||||
cfg: cfg,
|
||||
server: s,
|
||||
|
||||
@@ -9,8 +9,8 @@ import (
|
||||
|
||||
"github.com/mark3labs/mcp-go/mcp"
|
||||
|
||||
deploycli "git.t-juice.club/torjus/homelab-deploy/internal/cli"
|
||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
deploycli "code.t-juice.club/torjus/homelab-deploy/internal/cli"
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
)
|
||||
|
||||
// ToolConfig holds configuration for the MCP tools.
|
||||
|
||||
135
internal/messages/build.go
Normal file
135
internal/messages/build.go
Normal file
@@ -0,0 +1,135 @@
|
||||
package messages
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// BuildStatus represents the status of a build response.
|
||||
type BuildStatus string
|
||||
|
||||
const (
|
||||
BuildStatusStarted BuildStatus = "started"
|
||||
BuildStatusProgress BuildStatus = "progress"
|
||||
BuildStatusCompleted BuildStatus = "completed"
|
||||
BuildStatusFailed BuildStatus = "failed"
|
||||
BuildStatusRejected BuildStatus = "rejected"
|
||||
)
|
||||
|
||||
// IsFinal returns true if this status indicates a terminal state.
|
||||
func (s BuildStatus) IsFinal() bool {
|
||||
switch s {
|
||||
case BuildStatusCompleted, BuildStatusFailed, BuildStatusRejected:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// BuildRequest is the message sent to request a build.
|
||||
type BuildRequest struct {
|
||||
Repo string `json:"repo"` // Must match config
|
||||
Target string `json:"target"` // Hostname or "all"
|
||||
Branch string `json:"branch,omitempty"` // Optional, uses repo default
|
||||
ReplyTo string `json:"reply_to"`
|
||||
}
|
||||
|
||||
// Validate checks that the request is valid.
|
||||
func (r *BuildRequest) Validate() error {
|
||||
if r.Repo == "" {
|
||||
return fmt.Errorf("repo is required")
|
||||
}
|
||||
if !revisionRegex.MatchString(r.Repo) {
|
||||
return fmt.Errorf("invalid repo name format: %q", r.Repo)
|
||||
}
|
||||
if r.Target == "" {
|
||||
return fmt.Errorf("target is required")
|
||||
}
|
||||
// Target must be "all" or a valid hostname (same format as revision/branch)
|
||||
if r.Target != "all" && !revisionRegex.MatchString(r.Target) {
|
||||
return fmt.Errorf("invalid target format: %q", r.Target)
|
||||
}
|
||||
if r.Branch != "" && !revisionRegex.MatchString(r.Branch) {
|
||||
return fmt.Errorf("invalid branch format: %q", r.Branch)
|
||||
}
|
||||
if r.ReplyTo == "" {
|
||||
return fmt.Errorf("reply_to is required")
|
||||
}
|
||||
// Validate reply_to format to prevent publishing to arbitrary subjects
|
||||
if !strings.HasPrefix(r.ReplyTo, "build.responses.") {
|
||||
return fmt.Errorf("invalid reply_to format: must start with 'build.responses.'")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Marshal serializes the request to JSON.
|
||||
func (r *BuildRequest) Marshal() ([]byte, error) {
|
||||
return json.Marshal(r)
|
||||
}
|
||||
|
||||
// UnmarshalBuildRequest deserializes a request from JSON.
|
||||
func UnmarshalBuildRequest(data []byte) (*BuildRequest, error) {
|
||||
var r BuildRequest
|
||||
if err := json.Unmarshal(data, &r); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal build request: %w", err)
|
||||
}
|
||||
return &r, nil
|
||||
}
|
||||
|
||||
// BuildHostResult contains the result of building a single host.
|
||||
type BuildHostResult struct {
|
||||
Host string `json:"host"`
|
||||
Success bool `json:"success"`
|
||||
Error string `json:"error,omitempty"`
|
||||
Output string `json:"output,omitempty"`
|
||||
DurationSeconds float64 `json:"duration_seconds"`
|
||||
}
|
||||
|
||||
// BuildResponse is the message sent in response to a build request.
|
||||
type BuildResponse struct {
|
||||
Status BuildStatus `json:"status"`
|
||||
Message string `json:"message,omitempty"`
|
||||
|
||||
// Progress updates
|
||||
Host string `json:"host,omitempty"`
|
||||
HostSuccess *bool `json:"host_success,omitempty"`
|
||||
HostsCompleted int `json:"hosts_completed,omitempty"`
|
||||
HostsTotal int `json:"hosts_total,omitempty"`
|
||||
|
||||
// Final response
|
||||
Results []BuildHostResult `json:"results,omitempty"`
|
||||
TotalDurationSeconds float64 `json:"total_duration_seconds,omitempty"`
|
||||
Succeeded int `json:"succeeded,omitempty"`
|
||||
Failed int `json:"failed,omitempty"`
|
||||
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// NewBuildResponse creates a new response with the given status and message.
|
||||
func NewBuildResponse(status BuildStatus, message string) *BuildResponse {
|
||||
return &BuildResponse{
|
||||
Status: status,
|
||||
Message: message,
|
||||
}
|
||||
}
|
||||
|
||||
// WithError adds an error message to the response.
|
||||
func (r *BuildResponse) WithError(err string) *BuildResponse {
|
||||
r.Error = err
|
||||
return r
|
||||
}
|
||||
|
||||
// Marshal serializes the response to JSON.
|
||||
func (r *BuildResponse) Marshal() ([]byte, error) {
|
||||
return json.Marshal(r)
|
||||
}
|
||||
|
||||
// UnmarshalBuildResponse deserializes a response from JSON.
|
||||
func UnmarshalBuildResponse(data []byte) (*BuildResponse, error) {
|
||||
var r BuildResponse
|
||||
if err := json.Unmarshal(data, &r); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal build response: %w", err)
|
||||
}
|
||||
return &r, nil
|
||||
}
|
||||
@@ -35,6 +35,7 @@ const (
|
||||
StatusAccepted Status = "accepted"
|
||||
StatusRejected Status = "rejected"
|
||||
StatusStarted Status = "started"
|
||||
StatusRunning Status = "running"
|
||||
StatusCompleted Status = "completed"
|
||||
StatusFailed Status = "failed"
|
||||
)
|
||||
|
||||
99
internal/metrics/build_metrics.go
Normal file
99
internal/metrics/build_metrics.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// BuildCollector holds all Prometheus metrics for the builder.
|
||||
type BuildCollector struct {
|
||||
buildsTotal *prometheus.CounterVec
|
||||
buildHostTotal *prometheus.CounterVec
|
||||
buildDuration *prometheus.HistogramVec
|
||||
buildLastTimestamp *prometheus.GaugeVec
|
||||
buildLastSuccessTime *prometheus.GaugeVec
|
||||
buildLastFailureTime *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
// NewBuildCollector creates a new build metrics collector and registers it with the given registerer.
|
||||
func NewBuildCollector(reg prometheus.Registerer) *BuildCollector {
|
||||
c := &BuildCollector{
|
||||
buildsTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "homelab_deploy_builds_total",
|
||||
Help: "Total builds processed",
|
||||
},
|
||||
[]string{"repo", "status"},
|
||||
),
|
||||
buildHostTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "homelab_deploy_build_host_total",
|
||||
Help: "Total host builds processed",
|
||||
},
|
||||
[]string{"repo", "host", "status"},
|
||||
),
|
||||
buildDuration: prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "homelab_deploy_build_duration_seconds",
|
||||
Help: "Build execution time per host",
|
||||
Buckets: []float64{5, 10, 30, 60, 120, 300, 600, 1800, 3600, 7200, 14400},
|
||||
},
|
||||
[]string{"repo", "host"},
|
||||
),
|
||||
buildLastTimestamp: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "homelab_deploy_build_last_timestamp",
|
||||
Help: "Timestamp of last build attempt",
|
||||
},
|
||||
[]string{"repo"},
|
||||
),
|
||||
buildLastSuccessTime: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "homelab_deploy_build_last_success_timestamp",
|
||||
Help: "Timestamp of last successful build",
|
||||
},
|
||||
[]string{"repo"},
|
||||
),
|
||||
buildLastFailureTime: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "homelab_deploy_build_last_failure_timestamp",
|
||||
Help: "Timestamp of last failed build",
|
||||
},
|
||||
[]string{"repo"},
|
||||
),
|
||||
}
|
||||
|
||||
reg.MustRegister(c.buildsTotal)
|
||||
reg.MustRegister(c.buildHostTotal)
|
||||
reg.MustRegister(c.buildDuration)
|
||||
reg.MustRegister(c.buildLastTimestamp)
|
||||
reg.MustRegister(c.buildLastSuccessTime)
|
||||
reg.MustRegister(c.buildLastFailureTime)
|
||||
|
||||
return c
|
||||
}
|
||||
|
||||
// RecordBuildSuccess records a successful build.
|
||||
func (c *BuildCollector) RecordBuildSuccess(repo string) {
|
||||
c.buildsTotal.WithLabelValues(repo, "success").Inc()
|
||||
c.buildLastTimestamp.WithLabelValues(repo).SetToCurrentTime()
|
||||
c.buildLastSuccessTime.WithLabelValues(repo).SetToCurrentTime()
|
||||
}
|
||||
|
||||
// RecordBuildFailure records a failed build.
|
||||
func (c *BuildCollector) RecordBuildFailure(repo, errorCode string) {
|
||||
c.buildsTotal.WithLabelValues(repo, "failure").Inc()
|
||||
c.buildLastTimestamp.WithLabelValues(repo).SetToCurrentTime()
|
||||
c.buildLastFailureTime.WithLabelValues(repo).SetToCurrentTime()
|
||||
}
|
||||
|
||||
// RecordHostBuildSuccess records a successful host build.
|
||||
func (c *BuildCollector) RecordHostBuildSuccess(repo, host string, durationSeconds float64) {
|
||||
c.buildHostTotal.WithLabelValues(repo, host, "success").Inc()
|
||||
c.buildDuration.WithLabelValues(repo, host).Observe(durationSeconds)
|
||||
}
|
||||
|
||||
// RecordHostBuildFailure records a failed host build.
|
||||
func (c *BuildCollector) RecordHostBuildFailure(repo, host string, durationSeconds float64) {
|
||||
c.buildHostTotal.WithLabelValues(repo, host, "failure").Inc()
|
||||
c.buildDuration.WithLabelValues(repo, host).Observe(durationSeconds)
|
||||
}
|
||||
125
internal/metrics/metrics.go
Normal file
125
internal/metrics/metrics.go
Normal file
@@ -0,0 +1,125 @@
|
||||
// Package metrics provides Prometheus metrics for the homelab-deploy listener.
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"code.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// Collector holds all Prometheus metrics for the listener.
type Collector struct {
	deploymentsTotal     *prometheus.CounterVec   // total requests, labelled by status/action/error_code
	deploymentDuration   *prometheus.HistogramVec // execution time, labelled by action/success
	deploymentInProgress prometheus.Gauge         // 1 while a deployment is running, 0 otherwise
	info                 *prometheus.GaugeVec     // static instance metadata carried in labels
}
|
||||
|
||||
// NewCollector creates a new metrics collector and registers it with the given registerer.
|
||||
func NewCollector(reg prometheus.Registerer) *Collector {
|
||||
c := &Collector{
|
||||
deploymentsTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "homelab_deploy_deployments_total",
|
||||
Help: "Total deployment requests processed",
|
||||
},
|
||||
[]string{"status", "action", "error_code"},
|
||||
),
|
||||
deploymentDuration: prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "homelab_deploy_deployment_duration_seconds",
|
||||
Help: "Deployment execution time",
|
||||
// Bucket boundaries for typical NixOS build times
|
||||
Buckets: []float64{30, 60, 120, 300, 600, 900, 1200, 1800},
|
||||
},
|
||||
[]string{"action", "success"},
|
||||
),
|
||||
deploymentInProgress: prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "homelab_deploy_deployment_in_progress",
|
||||
Help: "1 if deployment running, 0 otherwise",
|
||||
},
|
||||
),
|
||||
info: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "homelab_deploy_info",
|
||||
Help: "Static instance metadata",
|
||||
},
|
||||
[]string{"hostname", "tier", "role", "version"},
|
||||
),
|
||||
}
|
||||
|
||||
reg.MustRegister(c.deploymentsTotal)
|
||||
reg.MustRegister(c.deploymentDuration)
|
||||
reg.MustRegister(c.deploymentInProgress)
|
||||
reg.MustRegister(c.info)
|
||||
|
||||
c.initMetrics()
|
||||
|
||||
return c
|
||||
}
|
||||
|
||||
// initMetrics initializes all metric label combinations with zero values.
|
||||
// This ensures metrics appear in Prometheus scrapes before any deployments occur.
|
||||
func (c *Collector) initMetrics() {
|
||||
actions := []messages.Action{
|
||||
messages.ActionSwitch,
|
||||
messages.ActionBoot,
|
||||
messages.ActionTest,
|
||||
messages.ActionDryActivate,
|
||||
}
|
||||
|
||||
// Initialize deployment counter for common status/action combinations
|
||||
for _, action := range actions {
|
||||
// Successful completions (no error code)
|
||||
c.deploymentsTotal.WithLabelValues("completed", string(action), "")
|
||||
// Failed deployments (no error code - from RecordDeploymentEnd)
|
||||
c.deploymentsTotal.WithLabelValues("failed", string(action), "")
|
||||
}
|
||||
|
||||
// Initialize histogram for all action/success combinations
|
||||
for _, action := range actions {
|
||||
c.deploymentDuration.WithLabelValues(string(action), "true")
|
||||
c.deploymentDuration.WithLabelValues(string(action), "false")
|
||||
}
|
||||
}
|
||||
|
||||
// SetInfo sets the static instance metadata.
// The gauge value is always 1; the information lives in the labels.
func (c *Collector) SetInfo(hostname, tier, role, version string) {
	c.info.WithLabelValues(hostname, tier, role, version).Set(1)
}
|
||||
|
||||
// RecordDeploymentStart marks the start of a deployment.
// Pair with RecordDeploymentEnd or RecordDeploymentFailure, which reset
// the in-progress gauge back to 0.
func (c *Collector) RecordDeploymentStart() {
	c.deploymentInProgress.Set(1)
}
|
||||
|
||||
// RecordDeploymentEnd records the completion of a deployment.
|
||||
func (c *Collector) RecordDeploymentEnd(action messages.Action, success bool, durationSeconds float64) {
|
||||
c.deploymentInProgress.Set(0)
|
||||
|
||||
successLabel := "false"
|
||||
if success {
|
||||
successLabel = "true"
|
||||
}
|
||||
|
||||
c.deploymentDuration.WithLabelValues(string(action), successLabel).Observe(durationSeconds)
|
||||
|
||||
status := "completed"
|
||||
if !success {
|
||||
status = "failed"
|
||||
}
|
||||
|
||||
c.deploymentsTotal.WithLabelValues(status, string(action), "").Inc()
|
||||
}
|
||||
|
||||
// RecordDeploymentFailure records a deployment failure with an error code.
|
||||
func (c *Collector) RecordDeploymentFailure(action messages.Action, errorCode messages.ErrorCode, durationSeconds float64) {
|
||||
c.deploymentInProgress.Set(0)
|
||||
c.deploymentDuration.WithLabelValues(string(action), "false").Observe(durationSeconds)
|
||||
c.deploymentsTotal.WithLabelValues("failed", string(action), string(errorCode)).Inc()
|
||||
}
|
||||
|
||||
// RecordRejection records a rejected deployment request.
// Rejected requests never start, so no duration is observed and the
// in-progress gauge is untouched.
func (c *Collector) RecordRejection(action messages.Action, errorCode messages.ErrorCode) {
	c.deploymentsTotal.WithLabelValues("rejected", string(action), string(errorCode)).Inc()
}
|
||||
359
internal/metrics/metrics_test.go
Normal file
359
internal/metrics/metrics_test.go
Normal file
@@ -0,0 +1,359 @@
|
||||
package metrics
|
||||
|
||||
import (
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
	"testing"
	"time"

	"code.t-juice.club/torjus/homelab-deploy/internal/messages"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)
|
||||
|
||||
// TestCollector_SetInfo verifies the info gauge is exposed with the
// expected label set and a constant value of 1.
func TestCollector_SetInfo(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := NewCollector(reg)

	c.SetInfo("testhost", "test", "web", "1.0.0")

	// Exposition-format expectation; labels are rendered alphabetically.
	expected := `
# HELP homelab_deploy_info Static instance metadata
# TYPE homelab_deploy_info gauge
homelab_deploy_info{hostname="testhost",role="web",tier="test",version="1.0.0"} 1
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_info"); err != nil {
		t.Errorf("unexpected metrics: %v", err)
	}
}
|
||||
|
||||
// TestCollector_RecordDeploymentStart verifies the in-progress gauge is
// raised to 1 when a deployment starts.
func TestCollector_RecordDeploymentStart(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := NewCollector(reg)

	c.RecordDeploymentStart()

	expected := `
# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise
# TYPE homelab_deploy_deployment_in_progress gauge
homelab_deploy_deployment_in_progress 1
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_deployment_in_progress"); err != nil {
		t.Errorf("unexpected metrics: %v", err)
	}
}
|
||||
|
||||
// TestCollector_RecordDeploymentEnd_Success verifies a successful
// deployment clears the in-progress gauge and increments only the
// switch/completed counter series.
func TestCollector_RecordDeploymentEnd_Success(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := NewCollector(reg)

	c.RecordDeploymentStart()
	c.RecordDeploymentEnd(messages.ActionSwitch, true, 120.5)

	// Check in_progress is 0
	inProgressExpected := `
# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise
# TYPE homelab_deploy_deployment_in_progress gauge
homelab_deploy_deployment_in_progress 0
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(inProgressExpected), "homelab_deploy_deployment_in_progress"); err != nil {
		t.Errorf("unexpected in_progress metrics: %v", err)
	}

	// Check counter incremented (includes all pre-initialized metrics)
	counterExpected := `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="boot",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 1
homelab_deploy_deployments_total{action="switch",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="test",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="test",error_code="",status="failed"} 0
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil {
		t.Errorf("unexpected counter metrics: %v", err)
	}
}
|
||||
|
||||
// TestCollector_RecordDeploymentEnd_Failure verifies a failed deployment
// (no specific error code) increments only the boot/failed counter series.
func TestCollector_RecordDeploymentEnd_Failure(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := NewCollector(reg)

	c.RecordDeploymentStart()
	c.RecordDeploymentEnd(messages.ActionBoot, false, 60.0)

	counterExpected := `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="boot",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 1
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="switch",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="test",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="test",error_code="",status="failed"} 0
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil {
		t.Errorf("unexpected counter metrics: %v", err)
	}
}
|
||||
|
||||
// TestCollector_RecordDeploymentFailure verifies a failure with an error
// code creates a new counter series carrying that code, alongside the
// pre-initialized zero-valued series.
func TestCollector_RecordDeploymentFailure(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := NewCollector(reg)

	c.RecordDeploymentStart()
	c.RecordDeploymentFailure(messages.ActionSwitch, messages.ErrorBuildFailed, 300.0)

	counterExpected := `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="boot",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="switch",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="switch",error_code="build_failed",status="failed"} 1
homelab_deploy_deployments_total{action="test",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="test",error_code="",status="failed"} 0
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil {
		t.Errorf("unexpected counter metrics: %v", err)
	}
}
|
||||
|
||||
// TestCollector_RecordRejection verifies a rejection creates a counter
// series with status "rejected" and the rejection's error code.
func TestCollector_RecordRejection(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := NewCollector(reg)

	c.RecordRejection(messages.ActionSwitch, messages.ErrorAlreadyRunning)

	expected := `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="boot",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="switch",error_code="",status="failed"} 0
homelab_deploy_deployments_total{action="switch",error_code="already_running",status="rejected"} 1
homelab_deploy_deployments_total{action="test",error_code="",status="completed"} 0
homelab_deploy_deployments_total{action="test",error_code="",status="failed"} 0
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_deployments_total"); err != nil {
		t.Errorf("unexpected metrics: %v", err)
	}
}
|
||||
|
||||
func TestCollector_MetricsInitializedAtStartup(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
_ = NewCollector(reg)
|
||||
|
||||
// Verify counter metrics are initialized with zero values before any deployments
|
||||
counterExpected := `
|
||||
# HELP homelab_deploy_deployments_total Total deployment requests processed
|
||||
# TYPE homelab_deploy_deployments_total counter
|
||||
homelab_deploy_deployments_total{action="boot",error_code="",status="completed"} 0
|
||||
homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 0
|
||||
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="completed"} 0
|
||||
homelab_deploy_deployments_total{action="dry-activate",error_code="",status="failed"} 0
|
||||
homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 0
|
||||
homelab_deploy_deployments_total{action="switch",error_code="",status="failed"} 0
|
||||
homelab_deploy_deployments_total{action="test",error_code="",status="completed"} 0
|
||||
homelab_deploy_deployments_total{action="test",error_code="",status="failed"} 0
|
||||
`
|
||||
if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil {
|
||||
t.Errorf("counter metrics not initialized: %v", err)
|
||||
}
|
||||
|
||||
// Verify histogram metrics are initialized with zero values before any deployments
|
||||
histogramExpected := `
|
||||
# HELP homelab_deploy_deployment_duration_seconds Deployment execution time
|
||||
# TYPE homelab_deploy_deployment_duration_seconds histogram
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="false",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="boot",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="boot",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="boot",success="true",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="boot",success="true"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="boot",success="true"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="false",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="dry-activate",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="dry-activate",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="dry-activate",success="true",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="dry-activate",success="true"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="dry-activate",success="true"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="false",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="switch",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="switch",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="switch",success="true",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="switch",success="true"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="switch",success="true"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="false",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="test",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="test",success="false"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="30"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="60"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="120"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="300"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="600"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="900"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="1200"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="1800"} 0
|
||||
homelab_deploy_deployment_duration_seconds_bucket{action="test",success="true",le="+Inf"} 0
|
||||
homelab_deploy_deployment_duration_seconds_sum{action="test",success="true"} 0
|
||||
homelab_deploy_deployment_duration_seconds_count{action="test",success="true"} 0
|
||||
`
|
||||
if err := testutil.GatherAndCompare(reg, strings.NewReader(histogramExpected), "homelab_deploy_deployment_duration_seconds"); err != nil {
|
||||
t.Errorf("histogram metrics not initialized: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestServer_StartShutdown(t *testing.T) {
|
||||
srv := NewServer(ServerConfig{
|
||||
Addr: ":0", // Let OS pick a free port
|
||||
})
|
||||
|
||||
if err := srv.Start(); err != nil {
|
||||
t.Fatalf("failed to start server: %v", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := srv.Shutdown(ctx); err != nil {
|
||||
t.Errorf("failed to shutdown server: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestServer_Endpoints(t *testing.T) {
|
||||
srv := NewServer(ServerConfig{
|
||||
Addr: "127.0.0.1:19972", // Use a fixed port for testing
|
||||
})
|
||||
|
||||
if err := srv.Start(); err != nil {
|
||||
t.Fatalf("failed to start server: %v", err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
_ = srv.Shutdown(ctx)
|
||||
}()
|
||||
|
||||
// Give server time to start
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
t.Run("health endpoint", func(t *testing.T) {
|
||||
resp, err := http.Get("http://127.0.0.1:19972/health")
|
||||
if err != nil {
|
||||
t.Fatalf("failed to get health endpoint: %v", err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if string(body) != "ok" {
|
||||
t.Errorf("expected body 'ok', got %q", string(body))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("metrics endpoint", func(t *testing.T) {
|
||||
// Set some info to have metrics to display
|
||||
srv.Collector().SetInfo("testhost", "test", "web", "1.0.0")
|
||||
|
||||
resp, err := http.Get("http://127.0.0.1:19972/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("failed to get metrics endpoint: %v", err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
bodyStr := string(body)
|
||||
|
||||
if !strings.Contains(bodyStr, "homelab_deploy_info") {
|
||||
t.Error("expected metrics to contain homelab_deploy_info")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestServer_Collector(t *testing.T) {
|
||||
srv := NewServer(ServerConfig{
|
||||
Addr: ":0",
|
||||
})
|
||||
|
||||
collector := srv.Collector()
|
||||
if collector == nil {
|
||||
t.Error("expected non-nil collector")
|
||||
}
|
||||
}
|
||||
107
internal/metrics/server.go
Normal file
107
internal/metrics/server.go
Normal file
@@ -0,0 +1,107 @@
|
||||
package metrics
|
||||
|
||||
import (
	"context"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
|
||||
|
||||
// ServerConfig holds configuration for the metrics server.
type ServerConfig struct {
	// Addr is the HTTP listen address (e.g. ":9972").
	Addr string
	// Logger receives lifecycle messages; slog.Default() is used when nil.
	Logger *slog.Logger
}
|
||||
|
||||
// Server serves Prometheus metrics over HTTP.
type Server struct {
	httpServer *http.Server           // serves /metrics and /health
	registry   *prometheus.Registry   // private registry backing the collector
	collector  *Collector             // listener metrics registered on registry
	logger     *slog.Logger           // lifecycle logging
	scrapeCh   chan struct{}          // receives a (non-blocking) signal per /metrics scrape
}
|
||||
|
||||
// NewServer creates a new metrics server.
|
||||
func NewServer(cfg ServerConfig) *Server {
|
||||
logger := cfg.Logger
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
|
||||
registry := prometheus.NewRegistry()
|
||||
collector := NewCollector(registry)
|
||||
|
||||
scrapeCh := make(chan struct{})
|
||||
|
||||
metricsHandler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{
|
||||
Registry: registry,
|
||||
})
|
||||
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle("/metrics", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
metricsHandler.ServeHTTP(w, r)
|
||||
// Signal that a scrape occurred (non-blocking)
|
||||
select {
|
||||
case scrapeCh <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}))
|
||||
mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte("ok"))
|
||||
})
|
||||
|
||||
return &Server{
|
||||
httpServer: &http.Server{
|
||||
Addr: cfg.Addr,
|
||||
Handler: mux,
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
},
|
||||
registry: registry,
|
||||
collector: collector,
|
||||
logger: logger,
|
||||
scrapeCh: scrapeCh,
|
||||
}
|
||||
}
|
||||
|
||||
// Collector returns the metrics collector backed by this server's registry.
func (s *Server) Collector() *Collector {
	return s.collector
}
|
||||
|
||||
// Registry returns the Prometheus registry, e.g. for registering
// additional collectors.
func (s *Server) Registry() *prometheus.Registry {
	return s.registry
}
|
||||
|
||||
// ScrapeCh returns a channel that receives a signal each time the metrics endpoint is scraped.
// Signals are sent non-blocking, so a slow or absent reader misses them
// rather than stalling the scrape handler.
func (s *Server) ScrapeCh() <-chan struct{} {
	return s.scrapeCh
}
|
||||
|
||||
// Start starts the HTTP server in a goroutine.
|
||||
func (s *Server) Start() error {
|
||||
s.logger.Info("starting metrics server", "addr", s.httpServer.Addr)
|
||||
|
||||
go func() {
|
||||
if err := s.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
s.logger.Error("metrics server error", "error", err)
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Shutdown gracefully shuts down the server.
|
||||
func (s *Server) Shutdown(ctx context.Context) error {
|
||||
s.logger.Info("shutting down metrics server")
|
||||
if err := s.httpServer.Shutdown(ctx); err != nil {
|
||||
return fmt.Errorf("failed to shutdown metrics server: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
297
nixos/module.nix
297
nixos/module.nix
@@ -2,20 +2,60 @@
|
||||
{ config, lib, pkgs, ... }:
|
||||
|
||||
let
|
||||
cfg = config.services.homelab-deploy.listener;
|
||||
listenerCfg = config.services.homelab-deploy.listener;
|
||||
builderCfg = config.services.homelab-deploy.builder;
|
||||
|
||||
# Build command line arguments from configuration
|
||||
args = lib.concatStringsSep " " ([
|
||||
"--hostname ${lib.escapeShellArg cfg.hostname}"
|
||||
"--tier ${cfg.tier}"
|
||||
"--nats-url ${lib.escapeShellArg cfg.natsUrl}"
|
||||
"--nkey-file ${lib.escapeShellArg cfg.nkeyFile}"
|
||||
"--flake-url ${lib.escapeShellArg cfg.flakeUrl}"
|
||||
"--timeout ${toString cfg.timeout}"
|
||||
"--discover-subject ${lib.escapeShellArg cfg.discoverSubject}"
|
||||
# Generate YAML config from settings
|
||||
generatedConfigFile = pkgs.writeText "builder.yaml" (lib.generators.toYAML {} {
|
||||
repos = lib.mapAttrs (name: repo: {
|
||||
url = repo.url;
|
||||
default_branch = repo.defaultBranch;
|
||||
}) builderCfg.settings.repos;
|
||||
});
|
||||
|
||||
# Use provided configFile or generate from settings
|
||||
builderConfigFile =
|
||||
if builderCfg.configFile != null
|
||||
then builderCfg.configFile
|
||||
else generatedConfigFile;
|
||||
|
||||
# Build command line arguments for listener from configuration
|
||||
listenerArgs = lib.concatStringsSep " " ([
|
||||
"--hostname ${lib.escapeShellArg listenerCfg.hostname}"
|
||||
"--tier ${listenerCfg.tier}"
|
||||
"--nats-url ${lib.escapeShellArg listenerCfg.natsUrl}"
|
||||
"--nkey-file ${lib.escapeShellArg listenerCfg.nkeyFile}"
|
||||
"--flake-url ${lib.escapeShellArg listenerCfg.flakeUrl}"
|
||||
"--timeout ${toString listenerCfg.timeout}"
|
||||
"--discover-subject ${lib.escapeShellArg listenerCfg.discoverSubject}"
|
||||
]
|
||||
++ lib.optional (cfg.role != null) "--role ${lib.escapeShellArg cfg.role}"
|
||||
++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects);
|
||||
++ lib.optional (listenerCfg.role != null) "--role ${lib.escapeShellArg listenerCfg.role}"
|
||||
++ map (s: "--deploy-subject ${lib.escapeShellArg s}") listenerCfg.deploySubjects
|
||||
++ lib.optionals listenerCfg.metrics.enable [
|
||||
"--metrics-enabled"
|
||||
"--metrics-addr ${lib.escapeShellArg listenerCfg.metrics.address}"
|
||||
]);
|
||||
|
||||
# Build command line arguments for builder from configuration
|
||||
builderArgs = lib.concatStringsSep " " ([
|
||||
"--nats-url ${lib.escapeShellArg builderCfg.natsUrl}"
|
||||
"--nkey-file ${lib.escapeShellArg builderCfg.nkeyFile}"
|
||||
"--config ${builderConfigFile}"
|
||||
"--timeout ${toString builderCfg.timeout}"
|
||||
]
|
||||
++ lib.optionals builderCfg.metrics.enable [
|
||||
"--metrics-enabled"
|
||||
"--metrics-addr ${lib.escapeShellArg builderCfg.metrics.address}"
|
||||
]);
|
||||
|
||||
# Extract port from metrics address for firewall rule
|
||||
extractPort = addr: let
|
||||
# Handle both ":9972" and "0.0.0.0:9972" formats
|
||||
parts = lib.splitString ":" addr;
|
||||
in lib.toInt (lib.last parts);
|
||||
|
||||
listenerMetricsPort = extractPort listenerCfg.metrics.address;
|
||||
builderMetricsPort = extractPort builderCfg.metrics.address;
|
||||
|
||||
in
|
||||
{
|
||||
@@ -94,50 +134,205 @@ in
|
||||
description = "Additional environment variables for the service";
|
||||
example = { GIT_SSH_COMMAND = "ssh -i /run/secrets/deploy-key"; };
|
||||
};
|
||||
};
|
||||
|
||||
config = lib.mkIf cfg.enable {
|
||||
systemd.services.homelab-deploy-listener = {
|
||||
description = "homelab-deploy listener";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network-online.target" ];
|
||||
wants = [ "network-online.target" ];
|
||||
metrics = {
|
||||
enable = lib.mkEnableOption "Prometheus metrics endpoint";
|
||||
|
||||
environment = cfg.environment // {
|
||||
# Nix needs a writable cache for git flake fetching
|
||||
XDG_CACHE_HOME = "/var/cache/homelab-deploy";
|
||||
address = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = ":9972";
|
||||
description = "Address for Prometheus metrics HTTP server";
|
||||
example = "127.0.0.1:9972";
|
||||
};
|
||||
|
||||
path = [ pkgs.git config.system.build.nixos-rebuild ];
|
||||
|
||||
serviceConfig = {
|
||||
CacheDirectory = "homelab-deploy";
|
||||
Type = "simple";
|
||||
ExecStart = "${cfg.package}/bin/homelab-deploy listener ${args}";
|
||||
Restart = "always";
|
||||
RestartSec = 10;
|
||||
|
||||
# Hardening (compatible with nixos-rebuild requirements)
|
||||
# Note: Some options are relaxed because nixos-rebuild requires:
|
||||
# - Write access to /nix/store for building
|
||||
# - Ability to activate system configurations
|
||||
# - Network access for fetching from git/cache
|
||||
# - Namespace support for nix sandbox builds
|
||||
NoNewPrivileges = false;
|
||||
ProtectSystem = "false";
|
||||
ProtectHome = "read-only";
|
||||
PrivateTmp = true;
|
||||
PrivateDevices = true;
|
||||
ProtectKernelTunables = true;
|
||||
ProtectKernelModules = true;
|
||||
ProtectControlGroups = true;
|
||||
RestrictAddressFamilies = [ "AF_UNIX" "AF_INET" "AF_INET6" ];
|
||||
RestrictNamespaces = false;
|
||||
RestrictSUIDSGID = true;
|
||||
LockPersonality = true;
|
||||
MemoryDenyWriteExecute = false;
|
||||
SystemCallArchitectures = "native";
|
||||
openFirewall = lib.mkOption {
|
||||
type = lib.types.bool;
|
||||
default = false;
|
||||
description = "Open firewall for metrics port";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
options.services.homelab-deploy.builder = {
|
||||
enable = lib.mkEnableOption "homelab-deploy builder service";
|
||||
|
||||
package = lib.mkOption {
|
||||
type = lib.types.package;
|
||||
default = self.packages.${pkgs.system}.homelab-deploy;
|
||||
description = "The homelab-deploy package to use";
|
||||
};
|
||||
|
||||
natsUrl = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
description = "NATS server URL";
|
||||
example = "nats://nats.example.com:4222";
|
||||
};
|
||||
|
||||
nkeyFile = lib.mkOption {
|
||||
type = lib.types.path;
|
||||
description = "Path to NKey seed file for NATS authentication";
|
||||
example = "/run/secrets/homelab-deploy-builder-nkey";
|
||||
};
|
||||
|
||||
configFile = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.path;
|
||||
default = null;
|
||||
description = ''
|
||||
Path to builder configuration file (YAML).
|
||||
If not specified, a config file will be generated from the `settings` option.
|
||||
'';
|
||||
example = "/etc/homelab-deploy/builder.yaml";
|
||||
};
|
||||
|
||||
settings = {
|
||||
repos = lib.mkOption {
|
||||
type = lib.types.attrsOf (lib.types.submodule {
|
||||
options = {
|
||||
url = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
description = "Git flake URL for the repository";
|
||||
example = "git+https://git.example.com/org/nixos-configs.git";
|
||||
};
|
||||
defaultBranch = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "master";
|
||||
description = "Default branch to build when not specified in request";
|
||||
example = "main";
|
||||
};
|
||||
};
|
||||
});
|
||||
default = {};
|
||||
description = ''
|
||||
Repository configuration for the builder.
|
||||
Each key is the repository name used in build requests.
|
||||
'';
|
||||
example = lib.literalExpression ''
|
||||
{
|
||||
nixos-servers = {
|
||||
url = "git+https://git.example.com/org/nixos-servers.git";
|
||||
defaultBranch = "master";
|
||||
};
|
||||
homelab = {
|
||||
url = "git+ssh://git@github.com/user/homelab.git";
|
||||
defaultBranch = "main";
|
||||
};
|
||||
}
|
||||
'';
|
||||
};
|
||||
};
|
||||
|
||||
timeout = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 1800;
|
||||
description = "Build timeout in seconds per host";
|
||||
};
|
||||
|
||||
environment = lib.mkOption {
|
||||
type = lib.types.attrsOf lib.types.str;
|
||||
default = { };
|
||||
description = "Additional environment variables for the service";
|
||||
example = { GIT_SSH_COMMAND = "ssh -i /run/secrets/deploy-key"; };
|
||||
};
|
||||
|
||||
metrics = {
|
||||
enable = lib.mkEnableOption "Prometheus metrics endpoint";
|
||||
|
||||
address = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = ":9973";
|
||||
description = "Address for Prometheus metrics HTTP server";
|
||||
example = "127.0.0.1:9973";
|
||||
};
|
||||
|
||||
openFirewall = lib.mkOption {
|
||||
type = lib.types.bool;
|
||||
default = false;
|
||||
description = "Open firewall for metrics port";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
config = lib.mkMerge [
|
||||
(lib.mkIf builderCfg.enable {
|
||||
assertions = [
|
||||
{
|
||||
assertion = builderCfg.configFile != null || builderCfg.settings.repos != {};
|
||||
message = "services.homelab-deploy.builder: either configFile or settings.repos must be specified";
|
||||
}
|
||||
];
|
||||
})
|
||||
|
||||
(lib.mkIf listenerCfg.enable {
|
||||
systemd.services.homelab-deploy-listener = {
|
||||
description = "homelab-deploy listener";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network-online.target" ];
|
||||
wants = [ "network-online.target" ];
|
||||
|
||||
# Prevent self-interruption during nixos-rebuild switch
|
||||
# The service will continue running the old version until manually restarted
|
||||
stopIfChanged = false;
|
||||
restartIfChanged = false;
|
||||
|
||||
environment = listenerCfg.environment // {
|
||||
# Nix needs a writable cache for git flake fetching
|
||||
XDG_CACHE_HOME = "/var/cache/homelab-deploy";
|
||||
};
|
||||
|
||||
path = [ pkgs.git config.system.build.nixos-rebuild ];
|
||||
|
||||
serviceConfig = {
|
||||
CacheDirectory = "homelab-deploy";
|
||||
Type = "simple";
|
||||
ExecStart = "${listenerCfg.package}/bin/homelab-deploy listener ${listenerArgs}";
|
||||
Restart = "always";
|
||||
RestartSec = 10;
|
||||
|
||||
# Minimal hardening - nixos-rebuild requires broad system access:
|
||||
# - Write access to /nix/store for building
|
||||
# - Kernel namespace support for nix sandbox builds
|
||||
# - Ability to activate system configurations
|
||||
# - Network access for fetching from git/cache
|
||||
# Following the approach of nixos auto-upgrade which has no hardening
|
||||
};
|
||||
};
|
||||
|
||||
networking.firewall.allowedTCPPorts = lib.mkIf (listenerCfg.metrics.enable && listenerCfg.metrics.openFirewall) [
|
||||
listenerMetricsPort
|
||||
];
|
||||
})
|
||||
|
||||
(lib.mkIf builderCfg.enable {
|
||||
systemd.services.homelab-deploy-builder = {
|
||||
description = "homelab-deploy builder";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network-online.target" ];
|
||||
wants = [ "network-online.target" ];
|
||||
|
||||
environment = builderCfg.environment // {
|
||||
# Nix needs a writable cache for git flake fetching
|
||||
XDG_CACHE_HOME = "/var/cache/homelab-deploy-builder";
|
||||
};
|
||||
|
||||
path = [ pkgs.git pkgs.nix ];
|
||||
|
||||
serviceConfig = {
|
||||
CacheDirectory = "homelab-deploy-builder";
|
||||
Type = "simple";
|
||||
ExecStart = "${builderCfg.package}/bin/homelab-deploy builder ${builderArgs}";
|
||||
Restart = "always";
|
||||
RestartSec = 10;
|
||||
|
||||
# Minimal hardening - nix build requires broad system access:
|
||||
# - Write access to /nix/store for building
|
||||
# - Kernel namespace support for nix sandbox builds
|
||||
# - Network access for fetching from git/cache
|
||||
};
|
||||
};
|
||||
|
||||
networking.firewall.allowedTCPPorts = lib.mkIf (builderCfg.metrics.enable && builderCfg.metrics.openFirewall) [
|
||||
builderMetricsPort
|
||||
];
|
||||
})
|
||||
];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user