From 79db119d1ca6630023947ef0a65896cc3307c2ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sat, 7 Feb 2026 07:58:22 +0100 Subject: [PATCH] feat: add Prometheus metrics to listener service Add an optional Prometheus metrics HTTP endpoint to the listener for monitoring deployment operations. Includes four metrics: - homelab_deploy_deployments_total (counter with status/action/error_code) - homelab_deploy_deployment_duration_seconds (histogram with action/success) - homelab_deploy_deployment_in_progress (gauge) - homelab_deploy_info (gauge with hostname/tier/role/version) New CLI flags: --metrics-enabled, --metrics-addr (default :9972) New NixOS options: metrics.enable, metrics.address, metrics.openFirewall Co-Authored-By: Claude Opus 4.5 --- README.md | 79 ++++++++++++ cmd/homelab-deploy/main.go | 14 ++- flake.nix | 2 +- go.mod | 10 ++ go.sum | 33 ++++- internal/listener/listener.go | 58 ++++++++- internal/metrics/metrics.go | 98 +++++++++++++++ internal/metrics/metrics_test.go | 210 +++++++++++++++++++++++++++++++ internal/metrics/server.go | 84 +++++++++++++ nixos/module.nix | 34 ++++- 10 files changed, 613 insertions(+), 9 deletions(-) create mode 100644 internal/metrics/metrics.go create mode 100644 internal/metrics/metrics_test.go create mode 100644 internal/metrics/server.go diff --git a/README.md b/README.md index 2fc41c6..b5107c5 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ homelab-deploy listener \ | `--timeout` | No | Deployment timeout in seconds (default: 600) | | `--deploy-subject` | No | NATS subjects to subscribe to (repeatable) | | `--discover-subject` | No | Discovery subject (default: `deploy.discover`) | +| `--metrics-enabled` | No | Enable Prometheus metrics endpoint | +| `--metrics-addr` | No | Metrics HTTP server address (default: `:9972`) | #### Subject Templates @@ -209,6 +211,9 @@ Add the module to your NixOS configuration: | `deploySubjects` | list of string | see below | Subjects to subscribe to | | `discoverSubject` | string | `"deploy.discover"` | Discovery subject | | `environment` | attrs | `{}` | Additional environment variables | +| `metrics.enable` | bool | `false` | Enable Prometheus metrics endpoint | +| `metrics.address` | string | `":9972"` | Metrics HTTP server address | +| `metrics.openFirewall` | bool | `false` | Open firewall for metrics port | Default `deploySubjects`: ```nix @@ -219,6 +224,80 @@ Default `deploySubjects`: ] ``` +## Prometheus Metrics + +The listener can expose Prometheus metrics for monitoring deployment operations. + +### Enabling Metrics + +**CLI:** +```bash +homelab-deploy listener \ + --hostname myhost \ + --tier prod \ + --nats-url nats://nats.example.com:4222 \ + --nkey-file /run/secrets/listener.nkey \ + --flake-url git+https://git.example.com/user/nixos-configs.git \ + --metrics-enabled \ + --metrics-addr :9972 +``` + +**NixOS module:** +```nix +services.homelab-deploy.listener = { + enable = true; + tier = "prod"; + natsUrl = "nats://nats.example.com:4222"; + nkeyFile = "/run/secrets/homelab-deploy-nkey"; + flakeUrl = "git+https://git.example.com/user/nixos-configs.git"; + metrics = { + enable = true; + address = ":9972"; + openFirewall = true; # Optional: open firewall for Prometheus scraping + }; +}; +``` + +### Available Metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `homelab_deploy_deployments_total` | Counter | `status`, `action`, `error_code` | Total deployment requests processed | +| `homelab_deploy_deployment_duration_seconds` | Histogram | `action`, `success` | Deployment execution time | +| `homelab_deploy_deployment_in_progress` | Gauge | - | 1 if deployment running, 0 otherwise | +| `homelab_deploy_info` | Gauge | `hostname`, `tier`, `role`, `version` | Static instance metadata | + +**Label values:** +- `status`: `completed`, `failed`, `rejected` +- `action`: `switch`, `boot`, `test`, `dry-activate` +- `error_code`: `invalid_action`, `invalid_revision`, `already_running`, `build_failed`, `timeout`, or empty +- `success`: `true`, `false` + +### HTTP Endpoints + +| Endpoint | Description | +|----------|-------------| +| `/metrics` | Prometheus metrics in text format | +| `/health` | Health check (returns `ok`) | + +### Example Prometheus Queries + +```promql +# Average deployment duration (last hour) +rate(homelab_deploy_deployment_duration_seconds_sum[1h]) / +rate(homelab_deploy_deployment_duration_seconds_count[1h]) + +# Deployment success rate (last 24 hours) +sum(rate(homelab_deploy_deployments_total{status="completed"}[24h])) / +sum(rate(homelab_deploy_deployments_total{status=~"completed|failed"}[24h])) + +# 95th percentile deployment time +histogram_quantile(0.95, rate(homelab_deploy_deployment_duration_seconds_bucket[1h])) + +# Currently running deployments across all hosts +sum(homelab_deploy_deployment_in_progress) +``` + ## Message Protocol ### Deploy Request diff --git a/cmd/homelab-deploy/main.go b/cmd/homelab-deploy/main.go index 573b7a2..c7060dd 100644 --- a/cmd/homelab-deploy/main.go +++ b/cmd/homelab-deploy/main.go @@ -16,7 +16,7 @@ import ( "github.com/urfave/cli/v3" ) -const version = "0.1.7" +const version = "0.1.8" func main() { app := &cli.Command{ @@ -90,6 +90,15 @@ func listenerCommand() *cli.Command { Usage: "NATS subject for host discovery requests", Value: "deploy.discover", }, + &cli.BoolFlag{ + Name: "metrics-enabled", + Usage: "Enable Prometheus metrics endpoint", + }, + &cli.StringFlag{ + Name: "metrics-addr", + Usage: "Address for Prometheus metrics HTTP server", + Value: ":9972", + }, }, Action: func(ctx context.Context, c *cli.Command) error { tier := c.String("tier") @@ -107,6 +116,9 @@ func listenerCommand() *cli.Command { Timeout: time.Duration(c.Int("timeout")) * time.Second, DeploySubjects: c.StringSlice("deploy-subject"), DiscoverSubject: c.String("discover-subject"), + MetricsEnabled: c.Bool("metrics-enabled"), + MetricsAddr: c.String("metrics-addr"), + Version: version, } logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ diff --git a/flake.nix b/flake.nix index 1c37b55..001e5dc 100644 --- a/flake.nix +++ b/flake.nix @@ -26,7 +26,7 @@ pname = "homelab-deploy"; inherit version; src = ./.; - vendorHash = "sha256-JXa+obN62zrrwXlplqojY7dvEunUqDdSTee6N8c5JTg="; + vendorHash = "sha256-CN+l0JbQu+HDfotkt3PUFzBexHCHpCKIIZpAQRyojBk="; subPackages = [ "cmd/homelab-deploy" ]; }; default = self.packages.${system}.homelab-deploy; diff --git a/go.mod b/go.mod index 5ec166c..835f140 100644 --- a/go.mod +++ b/go.mod @@ -7,20 +7,30 @@ require ( github.com/mark3labs/mcp-go v0.43.2 github.com/nats-io/nats.go v1.48.0 github.com/nats-io/nkeys v0.4.15 + github.com/prometheus/client_golang v1.23.2 github.com/urfave/cli/v3 v3.6.2 ) require ( github.com/bahlo/generic-list-go v0.2.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/buger/jsonparser v1.1.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/invopop/jsonschema v0.13.0 // indirect github.com/klauspost/compress v1.18.0 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/nats-io/nuid v1.0.1 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/spf13/cast v1.7.1 // indirect github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect github.com/yosida95/uritemplate/v3 v3.0.2 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/crypto v0.47.0 // indirect golang.org/x/sys v0.40.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index dc44fe3..f502363 100644 --- a/go.sum +++ b/go.sum @@ -1,13 +1,17 @@ github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk= github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/invopop/jsonschema v0.13.0 h1:KvpoAJWEjR3uD9Kbm2HWJmqsEaHt8lBUpd0qHcIi21E= @@ -19,10 +23,14 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mark3labs/mcp-go v0.43.2 h1:21PUSlWWiSbUPQwXIJ5WKlETixpFpq+WBpbMGDSVy/I= github.com/mark3labs/mcp-go v0.43.2/go.mod h1:YnJfOL382MIWDx1kMY+2zsRHU/q78dBg9aFb8W6Thdw= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U= github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g= github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= @@ -31,8 +39,16 @@ github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= -github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y= github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= @@ -43,11 +59,18 @@ github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/ github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw= github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8= golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A= golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/listener/listener.go b/internal/listener/listener.go index ce477b5..2ea3e22 100644 --- a/internal/listener/listener.go +++ b/internal/listener/listener.go @@ -8,6 +8,7 @@ import ( "git.t-juice.club/torjus/homelab-deploy/internal/deploy" "git.t-juice.club/torjus/homelab-deploy/internal/messages" + "git.t-juice.club/torjus/homelab-deploy/internal/metrics" "git.t-juice.club/torjus/homelab-deploy/internal/nats" ) @@ -22,6 +23,9 @@ type Config struct { Timeout time.Duration DeploySubjects []string DiscoverSubject string + MetricsEnabled bool + MetricsAddr string + Version string } // Listener handles deployment requests from NATS. @@ -38,6 +42,10 @@ type Listener struct { // restartCh signals that the listener should exit for restart // (e.g., after a successful switch deployment) restartCh chan struct{} + + // metrics server and collector (nil if metrics disabled) + metricsServer *metrics.Server + metrics *metrics.Collector } // New creates a new listener with the given configuration. @@ -46,17 +54,42 @@ func New(cfg Config, logger *slog.Logger) *Listener { logger = slog.Default() } - return &Listener{ + l := &Listener{ cfg: cfg, executor: deploy.NewExecutor(cfg.FlakeURL, cfg.Hostname, cfg.Timeout), lock: deploy.NewLock(), logger: logger, restartCh: make(chan struct{}, 1), } + + if cfg.MetricsEnabled { + l.metricsServer = metrics.NewServer(metrics.ServerConfig{ + Addr: cfg.MetricsAddr, + Logger: logger, + }) + l.metrics = l.metricsServer.Collector() + } + + return l } // Run starts the listener and blocks until the context is cancelled. func (l *Listener) Run(ctx context.Context) error { + // Start metrics server if enabled + if l.metricsServer != nil { + if err := l.metricsServer.Start(); err != nil { + return fmt.Errorf("failed to start metrics server: %w", err) + } + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = l.metricsServer.Shutdown(shutdownCtx) + }() + + // Set instance info metric + l.metrics.SetInfo(l.cfg.Hostname, l.cfg.Tier, l.cfg.Role, l.cfg.Version) + } + // Connect to NATS l.logger.Info("connecting to NATS", "url", l.cfg.NATSUrl, @@ -136,6 +169,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { messages.StatusRejected, err.Error(), ).WithError(messages.ErrorInvalidAction)) + if l.metrics != nil { + l.metrics.RecordRejection(req.Action, messages.ErrorInvalidAction) + } return } @@ -150,6 +186,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { messages.StatusRejected, "another deployment is already in progress", ).WithError(messages.ErrorAlreadyRunning)) + if l.metrics != nil { + l.metrics.RecordRejection(req.Action, messages.ErrorAlreadyRunning) + } return } defer l.lock.Release() @@ -161,6 +200,12 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { fmt.Sprintf("starting deployment: %s", l.executor.BuildCommand(req.Action, req.Revision)), )) + // Record deployment start for metrics + if l.metrics != nil { + l.metrics.RecordDeploymentStart() + } + startTime := time.Now() + // Validate revision ctx := context.Background() if err := l.executor.ValidateRevision(ctx, req.Revision); err != nil { @@ -173,6 +218,10 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { messages.StatusFailed, fmt.Sprintf("revision validation failed: %v", err), ).WithError(messages.ErrorInvalidRevision)) + if l.metrics != nil { + duration := time.Since(startTime).Seconds() + l.metrics.RecordDeploymentFailure(req.Action, messages.ErrorInvalidRevision, duration) + } return } @@ -184,6 +233,7 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { ) result := l.executor.Execute(ctx, req.Action, req.Revision) + duration := time.Since(startTime).Seconds() if result.Success { l.logger.Info("deployment completed successfully", @@ -194,6 +244,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { messages.StatusCompleted, "deployment completed successfully", )) + if l.metrics != nil { + l.metrics.RecordDeploymentEnd(req.Action, true, duration) + } // After a successful switch, signal restart so we pick up any new version if req.Action == messages.ActionSwitch { @@ -220,6 +273,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { messages.StatusFailed, fmt.Sprintf("deployment failed (exit code %d): %s", result.ExitCode, result.Stderr), ).WithError(errorCode)) + if l.metrics != nil { + l.metrics.RecordDeploymentFailure(req.Action, errorCode, duration) + } } } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go new file mode 100644 index 0000000..c4e872b --- /dev/null +++ b/internal/metrics/metrics.go @@ -0,0 +1,98 @@ +// Package metrics provides Prometheus metrics for the homelab-deploy listener. +package metrics + +import ( + "git.t-juice.club/torjus/homelab-deploy/internal/messages" + "github.com/prometheus/client_golang/prometheus" +) + +// Collector holds all Prometheus metrics for the listener. +type Collector struct { + deploymentsTotal *prometheus.CounterVec + deploymentDuration *prometheus.HistogramVec + deploymentInProgress prometheus.Gauge + info *prometheus.GaugeVec +} + +// NewCollector creates a new metrics collector and registers it with the given registerer. +func NewCollector(reg prometheus.Registerer) *Collector { + c := &Collector{ + deploymentsTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "homelab_deploy_deployments_total", + Help: "Total deployment requests processed", + }, + []string{"status", "action", "error_code"}, + ), + deploymentDuration: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "homelab_deploy_deployment_duration_seconds", + Help: "Deployment execution time", + // Bucket boundaries for typical NixOS build times + Buckets: []float64{30, 60, 120, 300, 600, 900, 1200, 1800}, + }, + []string{"action", "success"}, + ), + deploymentInProgress: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "homelab_deploy_deployment_in_progress", + Help: "1 if deployment running, 0 otherwise", + }, + ), + info: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "homelab_deploy_info", + Help: "Static instance metadata", + }, + []string{"hostname", "tier", "role", "version"}, + ), + } + + reg.MustRegister(c.deploymentsTotal) + reg.MustRegister(c.deploymentDuration) + reg.MustRegister(c.deploymentInProgress) + reg.MustRegister(c.info) + + return c +} + +// SetInfo sets the static instance metadata. +func (c *Collector) SetInfo(hostname, tier, role, version string) { + c.info.WithLabelValues(hostname, tier, role, version).Set(1) +} + +// RecordDeploymentStart marks the start of a deployment. +func (c *Collector) RecordDeploymentStart() { + c.deploymentInProgress.Set(1) +} + +// RecordDeploymentEnd records the completion of a deployment. +func (c *Collector) RecordDeploymentEnd(action messages.Action, success bool, durationSeconds float64) { + c.deploymentInProgress.Set(0) + + successLabel := "false" + if success { + successLabel = "true" + } + + c.deploymentDuration.WithLabelValues(string(action), successLabel).Observe(durationSeconds) + + status := "completed" + if !success { + status = "failed" + } + + c.deploymentsTotal.WithLabelValues(status, string(action), "").Inc() +} + +// RecordDeploymentFailure records a deployment failure with an error code. +func (c *Collector) RecordDeploymentFailure(action messages.Action, errorCode messages.ErrorCode, durationSeconds float64) { + c.deploymentInProgress.Set(0) + c.deploymentDuration.WithLabelValues(string(action), "false").Observe(durationSeconds) + c.deploymentsTotal.WithLabelValues("failed", string(action), string(errorCode)).Inc() +} + +// RecordRejection records a rejected deployment request. +func (c *Collector) RecordRejection(action messages.Action, errorCode messages.ErrorCode) { + c.deploymentsTotal.WithLabelValues("rejected", string(action), string(errorCode)).Inc() +} diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go new file mode 100644 index 0000000..3acc383 --- /dev/null +++ b/internal/metrics/metrics_test.go @@ -0,0 +1,210 @@ +package metrics + +import ( + "context" + "io" + "net/http" + "strings" + "testing" + "time" + + "git.t-juice.club/torjus/homelab-deploy/internal/messages" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestCollector_SetInfo(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector(reg) + + c.SetInfo("testhost", "test", "web", "1.0.0") + + expected := ` +# HELP homelab_deploy_info Static instance metadata +# TYPE homelab_deploy_info gauge +homelab_deploy_info{hostname="testhost",role="web",tier="test",version="1.0.0"} 1 +` + if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_info"); err != nil { + t.Errorf("unexpected metrics: %v", err) + } +} + +func TestCollector_RecordDeploymentStart(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector(reg) + + c.RecordDeploymentStart() + + expected := ` +# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise +# TYPE homelab_deploy_deployment_in_progress gauge +homelab_deploy_deployment_in_progress 1 +` + if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_deployment_in_progress"); err != nil { + t.Errorf("unexpected metrics: %v", err) + } +} + +func TestCollector_RecordDeploymentEnd_Success(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector(reg) + + c.RecordDeploymentStart() + c.RecordDeploymentEnd(messages.ActionSwitch, true, 120.5) + + // Check in_progress is 0 + inProgressExpected := ` +# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise +# TYPE homelab_deploy_deployment_in_progress gauge +homelab_deploy_deployment_in_progress 0 +` + if err := testutil.GatherAndCompare(reg, strings.NewReader(inProgressExpected), "homelab_deploy_deployment_in_progress"); err != nil { + t.Errorf("unexpected in_progress metrics: %v", err) + } + + // Check counter incremented + counterExpected := ` +# HELP homelab_deploy_deployments_total Total deployment requests processed +# TYPE homelab_deploy_deployments_total counter +homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 1 +` + if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil { + t.Errorf("unexpected counter metrics: %v", err) + } +} + +func TestCollector_RecordDeploymentEnd_Failure(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector(reg) + + c.RecordDeploymentStart() + c.RecordDeploymentEnd(messages.ActionBoot, false, 60.0) + + counterExpected := ` +# HELP homelab_deploy_deployments_total Total deployment requests processed +# TYPE homelab_deploy_deployments_total counter +homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 1 +` + if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil { + t.Errorf("unexpected counter metrics: %v", err) + } +} + +func TestCollector_RecordDeploymentFailure(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector(reg) + + c.RecordDeploymentStart() + c.RecordDeploymentFailure(messages.ActionSwitch, messages.ErrorBuildFailed, 300.0) + + counterExpected := ` +# HELP homelab_deploy_deployments_total Total deployment requests processed +# TYPE homelab_deploy_deployments_total counter +homelab_deploy_deployments_total{action="switch",error_code="build_failed",status="failed"} 1 +` + if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil { + t.Errorf("unexpected counter metrics: %v", err) + } +} + +func TestCollector_RecordRejection(t *testing.T) { + reg := prometheus.NewRegistry() + c := NewCollector(reg) + + c.RecordRejection(messages.ActionSwitch, messages.ErrorAlreadyRunning) + + expected := ` +# HELP homelab_deploy_deployments_total Total deployment requests processed +# TYPE homelab_deploy_deployments_total counter +homelab_deploy_deployments_total{action="switch",error_code="already_running",status="rejected"} 1 +` + if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_deployments_total"); err != nil { + t.Errorf("unexpected metrics: %v", err) + } +} + +func TestServer_StartShutdown(t *testing.T) { + srv := NewServer(ServerConfig{ + Addr: ":0", // Let OS pick a free port + }) + + if err := srv.Start(); err != nil { + t.Fatalf("failed to start server: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + if err := srv.Shutdown(ctx); err != nil { + t.Errorf("failed to shutdown server: %v", err) + } +} + +func TestServer_Endpoints(t *testing.T) { + srv := NewServer(ServerConfig{ + Addr: "127.0.0.1:19972", // Use a fixed port for testing + }) + + if err := srv.Start(); err != nil { + t.Fatalf("failed to start server: %v", err) + } + + defer func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = srv.Shutdown(ctx) + }() + + // Give server time to start + time.Sleep(50 * time.Millisecond) + + t.Run("health endpoint", func(t *testing.T) { + resp, err := http.Get("http://127.0.0.1:19972/health") + if err != nil { + t.Fatalf("failed to get health endpoint: %v", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status 200, got %d", resp.StatusCode) + } + + body, _ := io.ReadAll(resp.Body) + if string(body) != "ok" { + t.Errorf("expected body 'ok', got %q", string(body)) + } + }) + + t.Run("metrics endpoint", func(t *testing.T) { + // Set some info to have metrics to display + srv.Collector().SetInfo("testhost", "test", "web", "1.0.0") + + resp, err := http.Get("http://127.0.0.1:19972/metrics") + if err != nil { + t.Fatalf("failed to get metrics endpoint: %v", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + t.Errorf("expected status 200, got %d", resp.StatusCode) + } + + body, _ := io.ReadAll(resp.Body) + bodyStr := string(body) + + if !strings.Contains(bodyStr, "homelab_deploy_info") { + t.Error("expected metrics to contain homelab_deploy_info") + } + }) +} + +func TestServer_Collector(t *testing.T) { + srv := NewServer(ServerConfig{ + Addr: ":0", + }) + + collector := srv.Collector() + if collector == nil { + t.Error("expected non-nil collector") + } +} diff --git a/internal/metrics/server.go b/internal/metrics/server.go new file mode 100644 index 0000000..da6ebb8 --- /dev/null +++ b/internal/metrics/server.go @@ -0,0 +1,84 @@ +package metrics + +import ( + "context" + "fmt" + "log/slog" + "net/http" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// ServerConfig holds configuration for the metrics server. +type ServerConfig struct { + Addr string + Logger *slog.Logger +} + +// Server serves Prometheus metrics over HTTP. +type Server struct { + httpServer *http.Server + registry *prometheus.Registry + collector *Collector + logger *slog.Logger +} + +// NewServer creates a new metrics server. +func NewServer(cfg ServerConfig) *Server { + logger := cfg.Logger + if logger == nil { + logger = slog.Default() + } + + registry := prometheus.NewRegistry() + collector := NewCollector(registry) + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{ + Registry: registry, + })) + mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) + }) + + return &Server{ + httpServer: &http.Server{ + Addr: cfg.Addr, + Handler: mux, + ReadHeaderTimeout: 10 * time.Second, + }, + registry: registry, + collector: collector, + logger: logger, + } +} + +// Collector returns the metrics collector. +func (s *Server) Collector() *Collector { + return s.collector +} + +// Start starts the HTTP server in a goroutine. +func (s *Server) Start() error { + s.logger.Info("starting metrics server", "addr", s.httpServer.Addr) + + go func() { + if err := s.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + s.logger.Error("metrics server error", "error", err) + } + }() + + return nil +} + +// Shutdown gracefully shuts down the server. +func (s *Server) Shutdown(ctx context.Context) error { + s.logger.Info("shutting down metrics server") + if err := s.httpServer.Shutdown(ctx); err != nil { + return fmt.Errorf("failed to shutdown metrics server: %w", err) + } + return nil +} diff --git a/nixos/module.nix b/nixos/module.nix index 07bffc7..ee0a77b 100644 --- a/nixos/module.nix +++ b/nixos/module.nix @@ -15,7 +15,18 @@ let "--discover-subject ${lib.escapeShellArg cfg.discoverSubject}" ] ++ lib.optional (cfg.role != null) "--role ${lib.escapeShellArg cfg.role}" - ++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects); + ++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects + ++ lib.optionals cfg.metrics.enable [ + "--metrics-enabled" + "--metrics-addr ${lib.escapeShellArg cfg.metrics.address}" + ]); + + # Extract port from metrics address for firewall rule + metricsPort = let + addr = cfg.metrics.address; + # Handle both ":9972" and "0.0.0.0:9972" formats + parts = lib.splitString ":" addr; + in lib.toInt (lib.last parts); in { @@ -94,6 +105,23 @@ in description = "Additional environment variables for the service"; example = { GIT_SSH_COMMAND = "ssh -i /run/secrets/deploy-key"; }; }; + + metrics = { + enable = lib.mkEnableOption "Prometheus metrics endpoint"; + + address = lib.mkOption { + type = lib.types.str; + default = ":9972"; + description = "Address for Prometheus metrics HTTP server"; + example = "127.0.0.1:9972"; + }; + + openFirewall = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Open firewall for metrics port"; + }; + }; }; config = lib.mkIf cfg.enable { @@ -130,5 +158,9 @@ in # Following the approach of nixos auto-upgrade which has no hardening }; }; + + networking.firewall.allowedTCPPorts = lib.mkIf (cfg.metrics.enable && cfg.metrics.openFirewall) [ + metricsPort + ]; }; }