feat: add Prometheus metrics to listener service

Add an optional Prometheus metrics HTTP endpoint to the listener for
monitoring deployment operations. Includes four metrics:

- homelab_deploy_deployments_total (counter with status/action/error_code)
- homelab_deploy_deployment_duration_seconds (histogram with action/success)
- homelab_deploy_deployment_in_progress (gauge)
- homelab_deploy_info (gauge with hostname/tier/role/version)

New CLI flags: --metrics-enabled, --metrics-addr (default :9972)
New NixOS options: metrics.enable, metrics.address, metrics.openFirewall

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-07 07:58:22 +01:00
parent 56365835c7
commit 79db119d1c
10 changed files with 613 additions and 9 deletions

View File

@@ -61,6 +61,8 @@ homelab-deploy listener \
| `--timeout` | No | Deployment timeout in seconds (default: 600) |
| `--deploy-subject` | No | NATS subjects to subscribe to (repeatable) |
| `--discover-subject` | No | Discovery subject (default: `deploy.discover`) |
| `--metrics-enabled` | No | Enable Prometheus metrics endpoint |
| `--metrics-addr` | No | Metrics HTTP server address (default: `:9972`) |
#### Subject Templates
@@ -209,6 +211,9 @@ Add the module to your NixOS configuration:
| `deploySubjects` | list of string | see below | Subjects to subscribe to |
| `discoverSubject` | string | `"deploy.discover"` | Discovery subject |
| `environment` | attrs | `{}` | Additional environment variables |
| `metrics.enable` | bool | `false` | Enable Prometheus metrics endpoint |
| `metrics.address` | string | `":9972"` | Metrics HTTP server address |
| `metrics.openFirewall` | bool | `false` | Open firewall for metrics port |
Default `deploySubjects`:
```nix
@@ -219,6 +224,80 @@ Default `deploySubjects`:
]
```
## Prometheus Metrics
The listener can expose Prometheus metrics for monitoring deployment operations.
### Enabling Metrics
**CLI:**
```bash
homelab-deploy listener \
--hostname myhost \
--tier prod \
--nats-url nats://nats.example.com:4222 \
--nkey-file /run/secrets/listener.nkey \
--flake-url git+https://git.example.com/user/nixos-configs.git \
--metrics-enabled \
--metrics-addr :9972
```
**NixOS module:**
```nix
services.homelab-deploy.listener = {
enable = true;
tier = "prod";
natsUrl = "nats://nats.example.com:4222";
nkeyFile = "/run/secrets/homelab-deploy-nkey";
flakeUrl = "git+https://git.example.com/user/nixos-configs.git";
metrics = {
enable = true;
address = ":9972";
openFirewall = true; # Optional: open firewall for Prometheus scraping
};
};
```
### Available Metrics
| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `homelab_deploy_deployments_total` | Counter | `status`, `action`, `error_code` | Total deployment requests processed |
| `homelab_deploy_deployment_duration_seconds` | Histogram | `action`, `success` | Deployment execution time |
| `homelab_deploy_deployment_in_progress` | Gauge | - | 1 if a deployment is running, 0 otherwise |
| `homelab_deploy_info` | Gauge | `hostname`, `tier`, `role`, `version` | Static instance metadata |
**Label values:**
- `status`: `completed`, `failed`, `rejected`
- `action`: `switch`, `boot`, `test`, `dry-activate`
- `error_code`: `invalid_action`, `invalid_revision`, `already_running`, `build_failed`, `timeout`, or empty
- `success`: `true`, `false`
### HTTP Endpoints
| Endpoint | Description |
|----------|-------------|
| `/metrics` | Prometheus metrics in text format |
| `/health` | Health check (returns `ok`) |
### Example Prometheus Queries
```promql
# Average deployment duration (last hour)
rate(homelab_deploy_deployment_duration_seconds_sum[1h]) /
rate(homelab_deploy_deployment_duration_seconds_count[1h])
# Deployment success rate (last 24 hours)
sum(rate(homelab_deploy_deployments_total{status="completed"}[24h])) /
sum(rate(homelab_deploy_deployments_total{status=~"completed|failed"}[24h]))
# 95th percentile deployment time
histogram_quantile(0.95, rate(homelab_deploy_deployment_duration_seconds_bucket[1h]))
# Currently running deployments across all hosts
sum(homelab_deploy_deployment_in_progress)
```
## Message Protocol
### Deploy Request

View File

@@ -16,7 +16,7 @@ import (
"github.com/urfave/cli/v3"
)
const version = "0.1.7"
const version = "0.1.8"
func main() {
app := &cli.Command{
@@ -90,6 +90,15 @@ func listenerCommand() *cli.Command {
Usage: "NATS subject for host discovery requests",
Value: "deploy.discover",
},
&cli.BoolFlag{
Name: "metrics-enabled",
Usage: "Enable Prometheus metrics endpoint",
},
&cli.StringFlag{
Name: "metrics-addr",
Usage: "Address for Prometheus metrics HTTP server",
Value: ":9972",
},
},
Action: func(ctx context.Context, c *cli.Command) error {
tier := c.String("tier")
@@ -107,6 +116,9 @@ func listenerCommand() *cli.Command {
Timeout: time.Duration(c.Int("timeout")) * time.Second,
DeploySubjects: c.StringSlice("deploy-subject"),
DiscoverSubject: c.String("discover-subject"),
MetricsEnabled: c.Bool("metrics-enabled"),
MetricsAddr: c.String("metrics-addr"),
Version: version,
}
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{

View File

@@ -26,7 +26,7 @@
pname = "homelab-deploy";
inherit version;
src = ./.;
vendorHash = "sha256-JXa+obN62zrrwXlplqojY7dvEunUqDdSTee6N8c5JTg=";
vendorHash = "sha256-CN+l0JbQu+HDfotkt3PUFzBexHCHpCKIIZpAQRyojBk=";
subPackages = [ "cmd/homelab-deploy" ];
};
default = self.packages.${system}.homelab-deploy;

10
go.mod
View File

@@ -7,20 +7,30 @@ require (
github.com/mark3labs/mcp-go v0.43.2
github.com/nats-io/nats.go v1.48.0
github.com/nats-io/nkeys v0.4.15
github.com/prometheus/client_golang v1.23.2
github.com/urfave/cli/v3 v3.6.2
)
require (
github.com/bahlo/generic-list-go v0.2.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/buger/jsonparser v1.1.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/invopop/jsonschema v0.13.0 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/nats-io/nuid v1.0.1 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.66.1 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/spf13/cast v1.7.1 // indirect
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
go.yaml.in/yaml/v2 v2.4.2 // indirect
golang.org/x/crypto v0.47.0 // indirect
golang.org/x/sys v0.40.0 // indirect
google.golang.org/protobuf v1.36.8 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

33
go.sum
View File

@@ -1,13 +1,17 @@
github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/invopop/jsonschema v0.13.0 h1:KvpoAJWEjR3uD9Kbm2HWJmqsEaHt8lBUpd0qHcIi21E=
@@ -19,10 +23,14 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mark3labs/mcp-go v0.43.2 h1:21PUSlWWiSbUPQwXIJ5WKlETixpFpq+WBpbMGDSVy/I=
github.com/mark3labs/mcp-go v0.43.2/go.mod h1:YnJfOL382MIWDx1kMY+2zsRHU/q78dBg9aFb8W6Thdw=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
@@ -31,8 +39,16 @@ github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
@@ -43,11 +59,18 @@ github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/
github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@@ -8,6 +8,7 @@ import (
"git.t-juice.club/torjus/homelab-deploy/internal/deploy"
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"git.t-juice.club/torjus/homelab-deploy/internal/metrics"
"git.t-juice.club/torjus/homelab-deploy/internal/nats"
)
@@ -22,6 +23,9 @@ type Config struct {
Timeout time.Duration
DeploySubjects []string
DiscoverSubject string
MetricsEnabled bool
MetricsAddr string
Version string
}
// Listener handles deployment requests from NATS.
@@ -38,6 +42,10 @@ type Listener struct {
// restartCh signals that the listener should exit for restart
// (e.g., after a successful switch deployment)
restartCh chan struct{}
// metrics server and collector (nil if metrics disabled)
metricsServer *metrics.Server
metrics *metrics.Collector
}
// New creates a new listener with the given configuration.
@@ -46,17 +54,42 @@ func New(cfg Config, logger *slog.Logger) *Listener {
logger = slog.Default()
}
return &Listener{
l := &Listener{
cfg: cfg,
executor: deploy.NewExecutor(cfg.FlakeURL, cfg.Hostname, cfg.Timeout),
lock: deploy.NewLock(),
logger: logger,
restartCh: make(chan struct{}, 1),
}
if cfg.MetricsEnabled {
l.metricsServer = metrics.NewServer(metrics.ServerConfig{
Addr: cfg.MetricsAddr,
Logger: logger,
})
l.metrics = l.metricsServer.Collector()
}
return l
}
// Run starts the listener and blocks until the context is cancelled.
func (l *Listener) Run(ctx context.Context) error {
// Start metrics server if enabled
if l.metricsServer != nil {
if err := l.metricsServer.Start(); err != nil {
return fmt.Errorf("failed to start metrics server: %w", err)
}
defer func() {
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = l.metricsServer.Shutdown(shutdownCtx)
}()
// Set instance info metric
l.metrics.SetInfo(l.cfg.Hostname, l.cfg.Tier, l.cfg.Role, l.cfg.Version)
}
// Connect to NATS
l.logger.Info("connecting to NATS",
"url", l.cfg.NATSUrl,
@@ -136,6 +169,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
messages.StatusRejected,
err.Error(),
).WithError(messages.ErrorInvalidAction))
if l.metrics != nil {
l.metrics.RecordRejection(req.Action, messages.ErrorInvalidAction)
}
return
}
@@ -150,6 +186,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
messages.StatusRejected,
"another deployment is already in progress",
).WithError(messages.ErrorAlreadyRunning))
if l.metrics != nil {
l.metrics.RecordRejection(req.Action, messages.ErrorAlreadyRunning)
}
return
}
defer l.lock.Release()
@@ -161,6 +200,12 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
fmt.Sprintf("starting deployment: %s", l.executor.BuildCommand(req.Action, req.Revision)),
))
// Record deployment start for metrics
if l.metrics != nil {
l.metrics.RecordDeploymentStart()
}
startTime := time.Now()
// Validate revision
ctx := context.Background()
if err := l.executor.ValidateRevision(ctx, req.Revision); err != nil {
@@ -173,6 +218,10 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
messages.StatusFailed,
fmt.Sprintf("revision validation failed: %v", err),
).WithError(messages.ErrorInvalidRevision))
if l.metrics != nil {
duration := time.Since(startTime).Seconds()
l.metrics.RecordDeploymentFailure(req.Action, messages.ErrorInvalidRevision, duration)
}
return
}
@@ -184,6 +233,7 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
)
result := l.executor.Execute(ctx, req.Action, req.Revision)
duration := time.Since(startTime).Seconds()
if result.Success {
l.logger.Info("deployment completed successfully",
@@ -194,6 +244,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
messages.StatusCompleted,
"deployment completed successfully",
))
if l.metrics != nil {
l.metrics.RecordDeploymentEnd(req.Action, true, duration)
}
// After a successful switch, signal restart so we pick up any new version
if req.Action == messages.ActionSwitch {
@@ -220,6 +273,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
messages.StatusFailed,
fmt.Sprintf("deployment failed (exit code %d): %s", result.ExitCode, result.Stderr),
).WithError(errorCode))
if l.metrics != nil {
l.metrics.RecordDeploymentFailure(req.Action, errorCode, duration)
}
}
}

View File

@@ -0,0 +1,98 @@
// Package metrics provides Prometheus metrics for the homelab-deploy listener.
package metrics
import (
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"github.com/prometheus/client_golang/prometheus"
)
// Collector bundles the Prometheus metrics exported by the listener.
//
// Instances are created with NewCollector, which also registers every metric.
type Collector struct {
	// deploymentsTotal counts requests, labelled by status, action and error_code.
	deploymentsTotal *prometheus.CounterVec
	// deploymentDuration observes run time in seconds, labelled by action and success.
	deploymentDuration *prometheus.HistogramVec
	// deploymentInProgress is set to 1 by RecordDeploymentStart and reset to 0
	// by RecordDeploymentEnd / RecordDeploymentFailure.
	deploymentInProgress prometheus.Gauge
	// info exposes hostname, tier, role and version as labels on a constant gauge.
	info *prometheus.GaugeVec
}
// NewCollector builds the listener metrics and registers all of them with reg.
//
// Registration uses MustRegister, so it panics if any metric cannot be
// registered (for example when the same registerer is used twice).
func NewCollector(reg prometheus.Registerer) *Collector {
	requests := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "homelab_deploy_deployments_total",
		Help: "Total deployment requests processed",
	}, []string{"status", "action", "error_code"})

	durations := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name: "homelab_deploy_deployment_duration_seconds",
		Help: "Deployment execution time",
		// Upper bounds in seconds, chosen for typical NixOS build times.
		Buckets: []float64{30, 60, 120, 300, 600, 900, 1200, 1800},
	}, []string{"action", "success"})

	running := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "homelab_deploy_deployment_in_progress",
		Help: "1 if deployment running, 0 otherwise",
	})

	instance := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "homelab_deploy_info",
		Help: "Static instance metadata",
	}, []string{"hostname", "tier", "role", "version"})

	// Registered in the same order as the struct fields below.
	reg.MustRegister(requests, durations, running, instance)

	return &Collector{
		deploymentsTotal:     requests,
		deploymentDuration:   durations,
		deploymentInProgress: running,
		info:                 instance,
	}
}
// SetInfo publishes the instance metadata by setting the info gauge to 1 for
// the given hostname, tier, role and version label values.
func (c *Collector) SetInfo(hostname, tier, role, version string) {
	gauge := c.info.WithLabelValues(hostname, tier, role, version)
	gauge.Set(1)
}
// RecordDeploymentStart flags a deployment as running by setting the
// in-progress gauge to 1.
func (c *Collector) RecordDeploymentStart() {
	c.deploymentInProgress.Set(1)
}
// RecordDeploymentEnd records a finished deployment without an error code.
//
// It clears the in-progress gauge, observes durationSeconds in the duration
// histogram and increments the request counter. The counter status is
// "completed" when success is true and "failed" otherwise; error_code is
// always recorded as the empty string.
func (c *Collector) RecordDeploymentEnd(action messages.Action, success bool, durationSeconds float64) {
	status, successLabel := "failed", "false"
	if success {
		status, successLabel = "completed", "true"
	}
	actionLabel := string(action)

	c.deploymentInProgress.Set(0)
	c.deploymentDuration.WithLabelValues(actionLabel, successLabel).Observe(durationSeconds)
	c.deploymentsTotal.WithLabelValues(status, actionLabel, "").Inc()
}
// RecordDeploymentFailure records a deployment that ended with errorCode.
//
// It clears the in-progress gauge, observes durationSeconds with
// success="false" and increments the request counter with status "failed".
func (c *Collector) RecordDeploymentFailure(action messages.Action, errorCode messages.ErrorCode, durationSeconds float64) {
	actionLabel := string(action)

	c.deploymentInProgress.Set(0)

	histogram := c.deploymentDuration.WithLabelValues(actionLabel, "false")
	histogram.Observe(durationSeconds)

	counter := c.deploymentsTotal.WithLabelValues("failed", actionLabel, string(errorCode))
	counter.Inc()
}
// RecordRejection counts a request that was rejected before running, using
// status "rejected" together with the given action and errorCode.
//
// NOTE(review): action is used verbatim as a label value. If callers pass an
// action that failed validation, arbitrary request strings end up as label
// values and can grow the series count — confirm whether it should be
// normalized before being recorded.
func (c *Collector) RecordRejection(action messages.Action, errorCode messages.ErrorCode) {
	counter := c.deploymentsTotal.WithLabelValues("rejected", string(action), string(errorCode))
	counter.Inc()
}

View File

@@ -0,0 +1,210 @@
package metrics
import (
"context"
"io"
"net/http"
"strings"
"testing"
"time"
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)
// TestCollector_SetInfo checks that SetInfo exposes the info gauge with the
// supplied label values and a value of 1.
func TestCollector_SetInfo(t *testing.T) {
	const want = `
# HELP homelab_deploy_info Static instance metadata
# TYPE homelab_deploy_info gauge
homelab_deploy_info{hostname="testhost",role="web",tier="test",version="1.0.0"} 1
`
	registry := prometheus.NewRegistry()
	NewCollector(registry).SetInfo("testhost", "test", "web", "1.0.0")

	err := testutil.GatherAndCompare(registry, strings.NewReader(want), "homelab_deploy_info")
	if err != nil {
		t.Errorf("unexpected metrics: %v", err)
	}
}
// TestCollector_RecordDeploymentStart checks that starting a deployment sets
// the in-progress gauge to 1.
func TestCollector_RecordDeploymentStart(t *testing.T) {
	const want = `
# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise
# TYPE homelab_deploy_deployment_in_progress gauge
homelab_deploy_deployment_in_progress 1
`
	registry := prometheus.NewRegistry()
	NewCollector(registry).RecordDeploymentStart()

	err := testutil.GatherAndCompare(registry, strings.NewReader(want), "homelab_deploy_deployment_in_progress")
	if err != nil {
		t.Errorf("unexpected metrics: %v", err)
	}
}
// TestCollector_RecordDeploymentEnd_Success checks that a successful
// deployment resets the in-progress gauge and counts as "completed".
func TestCollector_RecordDeploymentEnd_Success(t *testing.T) {
	const (
		wantInProgress = `
# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise
# TYPE homelab_deploy_deployment_in_progress gauge
homelab_deploy_deployment_in_progress 0
`
		wantCounter = `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 1
`
	)

	registry := prometheus.NewRegistry()
	collector := NewCollector(registry)
	collector.RecordDeploymentStart()
	collector.RecordDeploymentEnd(messages.ActionSwitch, true, 120.5)

	// The gauge must be back at 0 once the deployment has ended.
	err := testutil.GatherAndCompare(registry, strings.NewReader(wantInProgress), "homelab_deploy_deployment_in_progress")
	if err != nil {
		t.Errorf("unexpected in_progress metrics: %v", err)
	}

	// Exactly one completed switch must have been counted.
	err = testutil.GatherAndCompare(registry, strings.NewReader(wantCounter), "homelab_deploy_deployments_total")
	if err != nil {
		t.Errorf("unexpected counter metrics: %v", err)
	}
}
// TestCollector_RecordDeploymentEnd_Failure checks that an unsuccessful
// RecordDeploymentEnd counts as "failed" with an empty error_code.
func TestCollector_RecordDeploymentEnd_Failure(t *testing.T) {
	const want = `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 1
`
	registry := prometheus.NewRegistry()
	collector := NewCollector(registry)
	collector.RecordDeploymentStart()
	collector.RecordDeploymentEnd(messages.ActionBoot, false, 60.0)

	err := testutil.GatherAndCompare(registry, strings.NewReader(want), "homelab_deploy_deployments_total")
	if err != nil {
		t.Errorf("unexpected counter metrics: %v", err)
	}
}
// TestCollector_RecordDeploymentFailure checks that a failure is counted with
// status "failed" and the supplied error code.
func TestCollector_RecordDeploymentFailure(t *testing.T) {
	const want = `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="switch",error_code="build_failed",status="failed"} 1
`
	registry := prometheus.NewRegistry()
	collector := NewCollector(registry)
	collector.RecordDeploymentStart()
	collector.RecordDeploymentFailure(messages.ActionSwitch, messages.ErrorBuildFailed, 300.0)

	err := testutil.GatherAndCompare(registry, strings.NewReader(want), "homelab_deploy_deployments_total")
	if err != nil {
		t.Errorf("unexpected counter metrics: %v", err)
	}
}
// TestCollector_RecordRejection checks that a rejection is counted with
// status "rejected" and the supplied error code.
func TestCollector_RecordRejection(t *testing.T) {
	const want = `
# HELP homelab_deploy_deployments_total Total deployment requests processed
# TYPE homelab_deploy_deployments_total counter
homelab_deploy_deployments_total{action="switch",error_code="already_running",status="rejected"} 1
`
	registry := prometheus.NewRegistry()
	NewCollector(registry).RecordRejection(messages.ActionSwitch, messages.ErrorAlreadyRunning)

	err := testutil.GatherAndCompare(registry, strings.NewReader(want), "homelab_deploy_deployments_total")
	if err != nil {
		t.Errorf("unexpected metrics: %v", err)
	}
}
// TestServer_StartShutdown checks that a server can be started and then shut
// down within five seconds without reporting an error.
func TestServer_StartShutdown(t *testing.T) {
	// ":0" lets the OS pick a free port.
	server := NewServer(ServerConfig{Addr: ":0"})

	if startErr := server.Start(); startErr != nil {
		t.Fatalf("failed to start server: %v", startErr)
	}

	shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	if stopErr := server.Shutdown(shutdownCtx); stopErr != nil {
		t.Errorf("failed to shutdown server: %v", stopErr)
	}
}
// TestServer_Endpoints starts a server on a fixed local port and checks the
// /health and /metrics endpoints.
//
// Instead of sleeping for a fixed interval, the test polls /health until the
// server accepts connections (bounded by a deadline), so it does not fail on a
// slow machine where startup takes longer than the sleep. Response bodies are
// read with their errors checked so a truncated read is reported as such
// rather than as a content mismatch.
//
// NOTE(review): the port is still hard-coded; the test will misbehave if
// something else is already listening on 127.0.0.1:19972.
func TestServer_Endpoints(t *testing.T) {
	const baseURL = "http://127.0.0.1:19972"

	srv := NewServer(ServerConfig{
		Addr: "127.0.0.1:19972", // Use a fixed port for testing
	})

	if err := srv.Start(); err != nil {
		t.Fatalf("failed to start server: %v", err)
	}
	defer func() {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		_ = srv.Shutdown(ctx)
	}()

	// Wait for the server to accept connections.
	deadline := time.Now().Add(2 * time.Second)
	for {
		resp, err := http.Get(baseURL + "/health")
		if err == nil {
			_ = resp.Body.Close()
			break
		}
		if time.Now().After(deadline) {
			t.Fatalf("metrics server did not become ready: %v", err)
		}
		time.Sleep(10 * time.Millisecond)
	}

	t.Run("health endpoint", func(t *testing.T) {
		resp, err := http.Get(baseURL + "/health")
		if err != nil {
			t.Fatalf("failed to get health endpoint: %v", err)
		}
		defer func() { _ = resp.Body.Close() }()

		if resp.StatusCode != http.StatusOK {
			t.Errorf("expected status 200, got %d", resp.StatusCode)
		}

		body, err := io.ReadAll(resp.Body)
		if err != nil {
			t.Fatalf("failed to read health response: %v", err)
		}
		if string(body) != "ok" {
			t.Errorf("expected body 'ok', got %q", string(body))
		}
	})

	t.Run("metrics endpoint", func(t *testing.T) {
		// Set some info to have metrics to display
		srv.Collector().SetInfo("testhost", "test", "web", "1.0.0")

		resp, err := http.Get(baseURL + "/metrics")
		if err != nil {
			t.Fatalf("failed to get metrics endpoint: %v", err)
		}
		defer func() { _ = resp.Body.Close() }()

		if resp.StatusCode != http.StatusOK {
			t.Errorf("expected status 200, got %d", resp.StatusCode)
		}

		body, err := io.ReadAll(resp.Body)
		if err != nil {
			t.Fatalf("failed to read metrics response: %v", err)
		}
		if !strings.Contains(string(body), "homelab_deploy_info") {
			t.Error("expected metrics to contain homelab_deploy_info")
		}
	})
}
// TestServer_Collector checks that a freshly constructed server exposes a
// non-nil collector.
func TestServer_Collector(t *testing.T) {
	server := NewServer(ServerConfig{Addr: ":0"})

	if server.Collector() == nil {
		t.Error("expected non-nil collector")
	}
}

View File

@@ -0,0 +1,84 @@
package metrics
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// ServerConfig carries the settings NewServer needs to build a metrics server.
type ServerConfig struct {
	// Addr is the address the HTTP server is configured with, e.g. ":9972".
	Addr string
	// Logger receives server log output; NewServer falls back to
	// slog.Default() when it is nil.
	Logger *slog.Logger
}
// Server exposes Prometheus metrics and a health check over HTTP.
type Server struct {
	// httpServer is the underlying HTTP server with the metrics routes.
	httpServer *http.Server
	// registry is the private registry served on /metrics.
	registry *prometheus.Registry
	// collector owns the listener metrics registered with registry.
	collector *Collector
	// logger is used for lifecycle and error messages.
	logger *slog.Logger
}
// NewServer assembles a metrics server from cfg without starting it.
//
// It creates a dedicated registry, registers the listener metrics with it
// through NewCollector, and wires two routes: /metrics serves the registry,
// and /health answers 200 with the body "ok". When cfg.Logger is nil,
// slog.Default() is used.
func NewServer(cfg ServerConfig) *Server {
	srv := &Server{
		registry: prometheus.NewRegistry(),
		logger:   cfg.Logger,
	}
	if srv.logger == nil {
		srv.logger = slog.Default()
	}
	srv.collector = NewCollector(srv.registry)

	routes := http.NewServeMux()
	metricsHandler := promhttp.HandlerFor(srv.registry, promhttp.HandlerOpts{
		Registry: srv.registry,
	})
	routes.Handle("/metrics", metricsHandler)
	routes.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("ok"))
	})

	srv.httpServer = &http.Server{
		Addr:              cfg.Addr,
		Handler:           routes,
		ReadHeaderTimeout: 10 * time.Second,
	}
	return srv
}
// Collector gives access to the collector whose metrics this server exposes.
func (s *Server) Collector() *Collector {
	return s.collector
}
// Start binds the configured address and then serves HTTP in a background
// goroutine.
//
// The listening socket is opened synchronously so that bind failures (port
// already in use, malformed address, insufficient privileges) are returned to
// the caller. Previously the bind happened inside the goroutine, so Start
// always returned nil and such failures were only logged. Errors that occur
// later while serving are logged; the goroutine ends once the server is shut
// down.
//
// Returns a wrapped error if the address cannot be bound, nil otherwise.
func (s *Server) Start() error {
	addr := s.httpServer.Addr
	if addr == "" {
		// Same fallback http.Server.ListenAndServe applies to an empty Addr.
		addr = ":http"
	}

	ln, err := net.Listen("tcp", addr)
	if err != nil {
		return fmt.Errorf("listening on %q: %w", addr, err)
	}

	// Log the bound address so an OS-assigned port (":0") is visible.
	s.logger.Info("starting metrics server", "addr", ln.Addr().String())

	go func() {
		// Serve closes ln on return; ErrServerClosed is the normal result of Shutdown.
		if err := s.httpServer.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
			s.logger.Error("metrics server error", "error", err)
		}
	}()
	return nil
}
// Shutdown stops the HTTP server gracefully, bounded by ctx.
//
// Returns a wrapped error if the underlying server fails to shut down, nil
// otherwise.
func (s *Server) Shutdown(ctx context.Context) error {
	s.logger.Info("shutting down metrics server")

	err := s.httpServer.Shutdown(ctx)
	if err == nil {
		return nil
	}
	return fmt.Errorf("failed to shutdown metrics server: %w", err)
}

View File

@@ -15,7 +15,18 @@ let
"--discover-subject ${lib.escapeShellArg cfg.discoverSubject}"
]
++ lib.optional (cfg.role != null) "--role ${lib.escapeShellArg cfg.role}"
++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects);
++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects
++ lib.optionals cfg.metrics.enable [
"--metrics-enabled"
"--metrics-addr ${lib.escapeShellArg cfg.metrics.address}"
]);
# Extract port from metrics address for firewall rule
metricsPort = let
addr = cfg.metrics.address;
# Handle both ":9972" and "0.0.0.0:9972" formats
parts = lib.splitString ":" addr;
in lib.toInt (lib.last parts);
in
{
@@ -94,6 +105,23 @@ in
description = "Additional environment variables for the service";
example = { GIT_SSH_COMMAND = "ssh -i /run/secrets/deploy-key"; };
};
metrics = {
enable = lib.mkEnableOption "Prometheus metrics endpoint";
address = lib.mkOption {
type = lib.types.str;
default = ":9972";
description = "Address for Prometheus metrics HTTP server";
example = "127.0.0.1:9972";
};
openFirewall = lib.mkOption {
type = lib.types.bool;
default = false;
description = "Open firewall for metrics port";
};
};
};
config = lib.mkIf cfg.enable {
@@ -130,5 +158,9 @@ in
# Following the approach of nixos auto-upgrade which has no hardening
};
};
networking.firewall.allowedTCPPorts = lib.mkIf (cfg.metrics.enable && cfg.metrics.openFirewall) [
metricsPort
];
};
}