feat: add Prometheus metrics to listener service
Add an optional Prometheus metrics HTTP endpoint to the listener for monitoring deployment operations.

Includes four metrics:
- homelab_deploy_deployments_total (counter with status/action/error_code)
- homelab_deploy_deployment_duration_seconds (histogram with action/success)
- homelab_deploy_deployment_in_progress (gauge)
- homelab_deploy_info (gauge with hostname/tier/role/version)

New CLI flags: --metrics-enabled, --metrics-addr (default :9972)
New NixOS options: metrics.enable, metrics.address, metrics.openFirewall

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
79
README.md
79
README.md
@@ -61,6 +61,8 @@ homelab-deploy listener \
|
|||||||
| `--timeout` | No | Deployment timeout in seconds (default: 600) |
|
| `--timeout` | No | Deployment timeout in seconds (default: 600) |
|
||||||
| `--deploy-subject` | No | NATS subjects to subscribe to (repeatable) |
|
| `--deploy-subject` | No | NATS subjects to subscribe to (repeatable) |
|
||||||
| `--discover-subject` | No | Discovery subject (default: `deploy.discover`) |
|
| `--discover-subject` | No | Discovery subject (default: `deploy.discover`) |
|
||||||
|
| `--metrics-enabled` | No | Enable Prometheus metrics endpoint |
|
||||||
|
| `--metrics-addr` | No | Metrics HTTP server address (default: `:9972`) |
|
||||||
|
|
||||||
#### Subject Templates
|
#### Subject Templates
|
||||||
|
|
||||||
@@ -209,6 +211,9 @@ Add the module to your NixOS configuration:
|
|||||||
| `deploySubjects` | list of string | see below | Subjects to subscribe to |
|
| `deploySubjects` | list of string | see below | Subjects to subscribe to |
|
||||||
| `discoverSubject` | string | `"deploy.discover"` | Discovery subject |
|
| `discoverSubject` | string | `"deploy.discover"` | Discovery subject |
|
||||||
| `environment` | attrs | `{}` | Additional environment variables |
|
| `environment` | attrs | `{}` | Additional environment variables |
|
||||||
|
| `metrics.enable` | bool | `false` | Enable Prometheus metrics endpoint |
|
||||||
|
| `metrics.address` | string | `":9972"` | Metrics HTTP server address |
|
||||||
|
| `metrics.openFirewall` | bool | `false` | Open firewall for metrics port |
|
||||||
|
|
||||||
Default `deploySubjects`:
|
Default `deploySubjects`:
|
||||||
```nix
|
```nix
|
||||||
@@ -219,6 +224,80 @@ Default `deploySubjects`:
|
|||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Prometheus Metrics
|
||||||
|
|
||||||
|
The listener can expose Prometheus metrics for monitoring deployment operations.
|
||||||
|
|
||||||
|
### Enabling Metrics
|
||||||
|
|
||||||
|
**CLI:**
|
||||||
|
```bash
|
||||||
|
homelab-deploy listener \
|
||||||
|
--hostname myhost \
|
||||||
|
--tier prod \
|
||||||
|
--nats-url nats://nats.example.com:4222 \
|
||||||
|
--nkey-file /run/secrets/listener.nkey \
|
||||||
|
--flake-url git+https://git.example.com/user/nixos-configs.git \
|
||||||
|
--metrics-enabled \
|
||||||
|
--metrics-addr :9972
|
||||||
|
```
|
||||||
|
|
||||||
|
**NixOS module:**
|
||||||
|
```nix
|
||||||
|
services.homelab-deploy.listener = {
|
||||||
|
enable = true;
|
||||||
|
tier = "prod";
|
||||||
|
natsUrl = "nats://nats.example.com:4222";
|
||||||
|
nkeyFile = "/run/secrets/homelab-deploy-nkey";
|
||||||
|
flakeUrl = "git+https://git.example.com/user/nixos-configs.git";
|
||||||
|
metrics = {
|
||||||
|
enable = true;
|
||||||
|
address = ":9972";
|
||||||
|
openFirewall = true; # Optional: open firewall for Prometheus scraping
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Available Metrics
|
||||||
|
|
||||||
|
| Metric | Type | Labels | Description |
|
||||||
|
|--------|------|--------|-------------|
|
||||||
|
| `homelab_deploy_deployments_total` | Counter | `status`, `action`, `error_code` | Total deployment requests processed |
|
||||||
|
| `homelab_deploy_deployment_duration_seconds` | Histogram | `action`, `success` | Deployment execution time |
|
||||||
|
| `homelab_deploy_deployment_in_progress` | Gauge | - | 1 if deployment running, 0 otherwise |
|
||||||
|
| `homelab_deploy_info` | Gauge | `hostname`, `tier`, `role`, `version` | Static instance metadata |
|
||||||
|
|
||||||
|
**Label values:**
|
||||||
|
- `status`: `completed`, `failed`, `rejected`
|
||||||
|
- `action`: `switch`, `boot`, `test`, `dry-activate`
|
||||||
|
- `error_code`: `invalid_action`, `invalid_revision`, `already_running`, `build_failed`, `timeout`, or an empty string when no error code applies
|
||||||
|
- `success`: `true`, `false`
|
||||||
|
|
||||||
|
### HTTP Endpoints
|
||||||
|
|
||||||
|
| Endpoint | Description |
|
||||||
|
|----------|-------------|
|
||||||
|
| `/metrics` | Prometheus metrics in text format |
|
||||||
|
| `/health` | Health check (returns `ok`) |
|
||||||
|
|
||||||
|
### Example Prometheus Queries
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Average deployment duration (last hour)
|
||||||
|
rate(homelab_deploy_deployment_duration_seconds_sum[1h]) /
|
||||||
|
rate(homelab_deploy_deployment_duration_seconds_count[1h])
|
||||||
|
|
||||||
|
# Deployment success rate (last 24 hours)
|
||||||
|
sum(rate(homelab_deploy_deployments_total{status="completed"}[24h])) /
|
||||||
|
sum(rate(homelab_deploy_deployments_total{status=~"completed|failed"}[24h]))
|
||||||
|
|
||||||
|
# 95th percentile deployment time
|
||||||
|
histogram_quantile(0.95, rate(homelab_deploy_deployment_duration_seconds_bucket[1h]))
|
||||||
|
|
||||||
|
# Currently running deployments across all hosts
|
||||||
|
sum(homelab_deploy_deployment_in_progress)
|
||||||
|
```
|
||||||
|
|
||||||
## Message Protocol
|
## Message Protocol
|
||||||
|
|
||||||
### Deploy Request
|
### Deploy Request
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ import (
|
|||||||
"github.com/urfave/cli/v3"
|
"github.com/urfave/cli/v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
const version = "0.1.7"
|
const version = "0.1.8"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
app := &cli.Command{
|
app := &cli.Command{
|
||||||
@@ -90,6 +90,15 @@ func listenerCommand() *cli.Command {
|
|||||||
Usage: "NATS subject for host discovery requests",
|
Usage: "NATS subject for host discovery requests",
|
||||||
Value: "deploy.discover",
|
Value: "deploy.discover",
|
||||||
},
|
},
|
||||||
|
&cli.BoolFlag{
|
||||||
|
Name: "metrics-enabled",
|
||||||
|
Usage: "Enable Prometheus metrics endpoint",
|
||||||
|
},
|
||||||
|
&cli.StringFlag{
|
||||||
|
Name: "metrics-addr",
|
||||||
|
Usage: "Address for Prometheus metrics HTTP server",
|
||||||
|
Value: ":9972",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
Action: func(ctx context.Context, c *cli.Command) error {
|
Action: func(ctx context.Context, c *cli.Command) error {
|
||||||
tier := c.String("tier")
|
tier := c.String("tier")
|
||||||
@@ -107,6 +116,9 @@ func listenerCommand() *cli.Command {
|
|||||||
Timeout: time.Duration(c.Int("timeout")) * time.Second,
|
Timeout: time.Duration(c.Int("timeout")) * time.Second,
|
||||||
DeploySubjects: c.StringSlice("deploy-subject"),
|
DeploySubjects: c.StringSlice("deploy-subject"),
|
||||||
DiscoverSubject: c.String("discover-subject"),
|
DiscoverSubject: c.String("discover-subject"),
|
||||||
|
MetricsEnabled: c.Bool("metrics-enabled"),
|
||||||
|
MetricsAddr: c.String("metrics-addr"),
|
||||||
|
Version: version,
|
||||||
}
|
}
|
||||||
|
|
||||||
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||||
|
|||||||
@@ -26,7 +26,7 @@
|
|||||||
pname = "homelab-deploy";
|
pname = "homelab-deploy";
|
||||||
inherit version;
|
inherit version;
|
||||||
src = ./.;
|
src = ./.;
|
||||||
vendorHash = "sha256-JXa+obN62zrrwXlplqojY7dvEunUqDdSTee6N8c5JTg=";
|
vendorHash = "sha256-CN+l0JbQu+HDfotkt3PUFzBexHCHpCKIIZpAQRyojBk=";
|
||||||
subPackages = [ "cmd/homelab-deploy" ];
|
subPackages = [ "cmd/homelab-deploy" ];
|
||||||
};
|
};
|
||||||
default = self.packages.${system}.homelab-deploy;
|
default = self.packages.${system}.homelab-deploy;
|
||||||
|
|||||||
10
go.mod
10
go.mod
@@ -7,20 +7,30 @@ require (
|
|||||||
github.com/mark3labs/mcp-go v0.43.2
|
github.com/mark3labs/mcp-go v0.43.2
|
||||||
github.com/nats-io/nats.go v1.48.0
|
github.com/nats-io/nats.go v1.48.0
|
||||||
github.com/nats-io/nkeys v0.4.15
|
github.com/nats-io/nkeys v0.4.15
|
||||||
|
github.com/prometheus/client_golang v1.23.2
|
||||||
github.com/urfave/cli/v3 v3.6.2
|
github.com/urfave/cli/v3 v3.6.2
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/bahlo/generic-list-go v0.2.0 // indirect
|
github.com/bahlo/generic-list-go v0.2.0 // indirect
|
||||||
|
github.com/beorn7/perks v1.0.1 // indirect
|
||||||
github.com/buger/jsonparser v1.1.1 // indirect
|
github.com/buger/jsonparser v1.1.1 // indirect
|
||||||
|
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||||
github.com/invopop/jsonschema v0.13.0 // indirect
|
github.com/invopop/jsonschema v0.13.0 // indirect
|
||||||
github.com/klauspost/compress v1.18.0 // indirect
|
github.com/klauspost/compress v1.18.0 // indirect
|
||||||
|
github.com/kylelemons/godebug v1.1.0 // indirect
|
||||||
github.com/mailru/easyjson v0.7.7 // indirect
|
github.com/mailru/easyjson v0.7.7 // indirect
|
||||||
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||||
github.com/nats-io/nuid v1.0.1 // indirect
|
github.com/nats-io/nuid v1.0.1 // indirect
|
||||||
|
github.com/prometheus/client_model v0.6.2 // indirect
|
||||||
|
github.com/prometheus/common v0.66.1 // indirect
|
||||||
|
github.com/prometheus/procfs v0.16.1 // indirect
|
||||||
github.com/spf13/cast v1.7.1 // indirect
|
github.com/spf13/cast v1.7.1 // indirect
|
||||||
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
|
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
|
||||||
github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
|
github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
|
||||||
|
go.yaml.in/yaml/v2 v2.4.2 // indirect
|
||||||
golang.org/x/crypto v0.47.0 // indirect
|
golang.org/x/crypto v0.47.0 // indirect
|
||||||
golang.org/x/sys v0.40.0 // indirect
|
golang.org/x/sys v0.40.0 // indirect
|
||||||
|
google.golang.org/protobuf v1.36.8 // indirect
|
||||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
33
go.sum
33
go.sum
@@ -1,13 +1,17 @@
|
|||||||
github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
|
github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
|
||||||
github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
|
github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
|
||||||
|
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||||
|
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||||
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
||||||
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||||
|
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||||
|
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
|
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
|
||||||
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
|
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
|
||||||
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
|
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||||
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
github.com/invopop/jsonschema v0.13.0 h1:KvpoAJWEjR3uD9Kbm2HWJmqsEaHt8lBUpd0qHcIi21E=
|
github.com/invopop/jsonschema v0.13.0 h1:KvpoAJWEjR3uD9Kbm2HWJmqsEaHt8lBUpd0qHcIi21E=
|
||||||
@@ -19,10 +23,14 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
|||||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||||
|
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||||
|
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
||||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||||
github.com/mark3labs/mcp-go v0.43.2 h1:21PUSlWWiSbUPQwXIJ5WKlETixpFpq+WBpbMGDSVy/I=
|
github.com/mark3labs/mcp-go v0.43.2 h1:21PUSlWWiSbUPQwXIJ5WKlETixpFpq+WBpbMGDSVy/I=
|
||||||
github.com/mark3labs/mcp-go v0.43.2/go.mod h1:YnJfOL382MIWDx1kMY+2zsRHU/q78dBg9aFb8W6Thdw=
|
github.com/mark3labs/mcp-go v0.43.2/go.mod h1:YnJfOL382MIWDx1kMY+2zsRHU/q78dBg9aFb8W6Thdw=
|
||||||
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||||
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||||
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
|
github.com/nats-io/nats.go v1.48.0 h1:pSFyXApG+yWU/TgbKCjmm5K4wrHu86231/w84qRVR+U=
|
||||||
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
|
github.com/nats-io/nats.go v1.48.0/go.mod h1:iRWIPokVIFbVijxuMQq4y9ttaBTMe0SFdlZfMDd+33g=
|
||||||
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
|
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
|
||||||
@@ -31,8 +39,16 @@ github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
|||||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
|
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||||
|
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||||
|
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||||
|
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
|
||||||
|
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
|
||||||
|
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
|
||||||
|
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
|
||||||
|
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||||
|
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||||
github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
|
github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
|
||||||
github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
|
github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
@@ -43,11 +59,18 @@ github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/
|
|||||||
github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
|
github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
|
||||||
github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
|
github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
|
||||||
github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
|
github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
|
||||||
|
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||||
|
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||||
|
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||||
|
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||||
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
|
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
|
||||||
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
|
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
|
||||||
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||||
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
|
||||||
|
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import (
|
|||||||
|
|
||||||
"git.t-juice.club/torjus/homelab-deploy/internal/deploy"
|
"git.t-juice.club/torjus/homelab-deploy/internal/deploy"
|
||||||
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||||
|
"git.t-juice.club/torjus/homelab-deploy/internal/metrics"
|
||||||
"git.t-juice.club/torjus/homelab-deploy/internal/nats"
|
"git.t-juice.club/torjus/homelab-deploy/internal/nats"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -22,6 +23,9 @@ type Config struct {
|
|||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
DeploySubjects []string
|
DeploySubjects []string
|
||||||
DiscoverSubject string
|
DiscoverSubject string
|
||||||
|
MetricsEnabled bool
|
||||||
|
MetricsAddr string
|
||||||
|
Version string
|
||||||
}
|
}
|
||||||
|
|
||||||
// Listener handles deployment requests from NATS.
|
// Listener handles deployment requests from NATS.
|
||||||
@@ -38,6 +42,10 @@ type Listener struct {
|
|||||||
// restartCh signals that the listener should exit for restart
|
// restartCh signals that the listener should exit for restart
|
||||||
// (e.g., after a successful switch deployment)
|
// (e.g., after a successful switch deployment)
|
||||||
restartCh chan struct{}
|
restartCh chan struct{}
|
||||||
|
|
||||||
|
// metrics server and collector (nil if metrics disabled)
|
||||||
|
metricsServer *metrics.Server
|
||||||
|
metrics *metrics.Collector
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a new listener with the given configuration.
|
// New creates a new listener with the given configuration.
|
||||||
@@ -46,17 +54,42 @@ func New(cfg Config, logger *slog.Logger) *Listener {
|
|||||||
logger = slog.Default()
|
logger = slog.Default()
|
||||||
}
|
}
|
||||||
|
|
||||||
return &Listener{
|
l := &Listener{
|
||||||
cfg: cfg,
|
cfg: cfg,
|
||||||
executor: deploy.NewExecutor(cfg.FlakeURL, cfg.Hostname, cfg.Timeout),
|
executor: deploy.NewExecutor(cfg.FlakeURL, cfg.Hostname, cfg.Timeout),
|
||||||
lock: deploy.NewLock(),
|
lock: deploy.NewLock(),
|
||||||
logger: logger,
|
logger: logger,
|
||||||
restartCh: make(chan struct{}, 1),
|
restartCh: make(chan struct{}, 1),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if cfg.MetricsEnabled {
|
||||||
|
l.metricsServer = metrics.NewServer(metrics.ServerConfig{
|
||||||
|
Addr: cfg.MetricsAddr,
|
||||||
|
Logger: logger,
|
||||||
|
})
|
||||||
|
l.metrics = l.metricsServer.Collector()
|
||||||
|
}
|
||||||
|
|
||||||
|
return l
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run starts the listener and blocks until the context is cancelled.
|
// Run starts the listener and blocks until the context is cancelled.
|
||||||
func (l *Listener) Run(ctx context.Context) error {
|
func (l *Listener) Run(ctx context.Context) error {
|
||||||
|
// Start metrics server if enabled
|
||||||
|
if l.metricsServer != nil {
|
||||||
|
if err := l.metricsServer.Start(); err != nil {
|
||||||
|
return fmt.Errorf("failed to start metrics server: %w", err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = l.metricsServer.Shutdown(shutdownCtx)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Set instance info metric
|
||||||
|
l.metrics.SetInfo(l.cfg.Hostname, l.cfg.Tier, l.cfg.Role, l.cfg.Version)
|
||||||
|
}
|
||||||
|
|
||||||
// Connect to NATS
|
// Connect to NATS
|
||||||
l.logger.Info("connecting to NATS",
|
l.logger.Info("connecting to NATS",
|
||||||
"url", l.cfg.NATSUrl,
|
"url", l.cfg.NATSUrl,
|
||||||
@@ -136,6 +169,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
messages.StatusRejected,
|
messages.StatusRejected,
|
||||||
err.Error(),
|
err.Error(),
|
||||||
).WithError(messages.ErrorInvalidAction))
|
).WithError(messages.ErrorInvalidAction))
|
||||||
|
if l.metrics != nil {
|
||||||
|
l.metrics.RecordRejection(req.Action, messages.ErrorInvalidAction)
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -150,6 +186,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
messages.StatusRejected,
|
messages.StatusRejected,
|
||||||
"another deployment is already in progress",
|
"another deployment is already in progress",
|
||||||
).WithError(messages.ErrorAlreadyRunning))
|
).WithError(messages.ErrorAlreadyRunning))
|
||||||
|
if l.metrics != nil {
|
||||||
|
l.metrics.RecordRejection(req.Action, messages.ErrorAlreadyRunning)
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer l.lock.Release()
|
defer l.lock.Release()
|
||||||
@@ -161,6 +200,12 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
fmt.Sprintf("starting deployment: %s", l.executor.BuildCommand(req.Action, req.Revision)),
|
fmt.Sprintf("starting deployment: %s", l.executor.BuildCommand(req.Action, req.Revision)),
|
||||||
))
|
))
|
||||||
|
|
||||||
|
// Record deployment start for metrics
|
||||||
|
if l.metrics != nil {
|
||||||
|
l.metrics.RecordDeploymentStart()
|
||||||
|
}
|
||||||
|
startTime := time.Now()
|
||||||
|
|
||||||
// Validate revision
|
// Validate revision
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
if err := l.executor.ValidateRevision(ctx, req.Revision); err != nil {
|
if err := l.executor.ValidateRevision(ctx, req.Revision); err != nil {
|
||||||
@@ -173,6 +218,10 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
messages.StatusFailed,
|
messages.StatusFailed,
|
||||||
fmt.Sprintf("revision validation failed: %v", err),
|
fmt.Sprintf("revision validation failed: %v", err),
|
||||||
).WithError(messages.ErrorInvalidRevision))
|
).WithError(messages.ErrorInvalidRevision))
|
||||||
|
if l.metrics != nil {
|
||||||
|
duration := time.Since(startTime).Seconds()
|
||||||
|
l.metrics.RecordDeploymentFailure(req.Action, messages.ErrorInvalidRevision, duration)
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,6 +233,7 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
)
|
)
|
||||||
|
|
||||||
result := l.executor.Execute(ctx, req.Action, req.Revision)
|
result := l.executor.Execute(ctx, req.Action, req.Revision)
|
||||||
|
duration := time.Since(startTime).Seconds()
|
||||||
|
|
||||||
if result.Success {
|
if result.Success {
|
||||||
l.logger.Info("deployment completed successfully",
|
l.logger.Info("deployment completed successfully",
|
||||||
@@ -194,6 +244,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
messages.StatusCompleted,
|
messages.StatusCompleted,
|
||||||
"deployment completed successfully",
|
"deployment completed successfully",
|
||||||
))
|
))
|
||||||
|
if l.metrics != nil {
|
||||||
|
l.metrics.RecordDeploymentEnd(req.Action, true, duration)
|
||||||
|
}
|
||||||
|
|
||||||
// After a successful switch, signal restart so we pick up any new version
|
// After a successful switch, signal restart so we pick up any new version
|
||||||
if req.Action == messages.ActionSwitch {
|
if req.Action == messages.ActionSwitch {
|
||||||
@@ -220,6 +273,9 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
messages.StatusFailed,
|
messages.StatusFailed,
|
||||||
fmt.Sprintf("deployment failed (exit code %d): %s", result.ExitCode, result.Stderr),
|
fmt.Sprintf("deployment failed (exit code %d): %s", result.ExitCode, result.Stderr),
|
||||||
).WithError(errorCode))
|
).WithError(errorCode))
|
||||||
|
if l.metrics != nil {
|
||||||
|
l.metrics.RecordDeploymentFailure(req.Action, errorCode, duration)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
98
internal/metrics/metrics.go
Normal file
98
internal/metrics/metrics.go
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
// Package metrics provides Prometheus metrics for the homelab-deploy listener.
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Collector holds all Prometheus metrics for the listener.
|
||||||
|
type Collector struct {
|
||||||
|
deploymentsTotal *prometheus.CounterVec
|
||||||
|
deploymentDuration *prometheus.HistogramVec
|
||||||
|
deploymentInProgress prometheus.Gauge
|
||||||
|
info *prometheus.GaugeVec
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewCollector creates a new metrics collector and registers it with the given registerer.
|
||||||
|
func NewCollector(reg prometheus.Registerer) *Collector {
|
||||||
|
c := &Collector{
|
||||||
|
deploymentsTotal: prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Name: "homelab_deploy_deployments_total",
|
||||||
|
Help: "Total deployment requests processed",
|
||||||
|
},
|
||||||
|
[]string{"status", "action", "error_code"},
|
||||||
|
),
|
||||||
|
deploymentDuration: prometheus.NewHistogramVec(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Name: "homelab_deploy_deployment_duration_seconds",
|
||||||
|
Help: "Deployment execution time",
|
||||||
|
// Bucket boundaries for typical NixOS build times
|
||||||
|
Buckets: []float64{30, 60, 120, 300, 600, 900, 1200, 1800},
|
||||||
|
},
|
||||||
|
[]string{"action", "success"},
|
||||||
|
),
|
||||||
|
deploymentInProgress: prometheus.NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Name: "homelab_deploy_deployment_in_progress",
|
||||||
|
Help: "1 if deployment running, 0 otherwise",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
info: prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Name: "homelab_deploy_info",
|
||||||
|
Help: "Static instance metadata",
|
||||||
|
},
|
||||||
|
[]string{"hostname", "tier", "role", "version"},
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
reg.MustRegister(c.deploymentsTotal)
|
||||||
|
reg.MustRegister(c.deploymentDuration)
|
||||||
|
reg.MustRegister(c.deploymentInProgress)
|
||||||
|
reg.MustRegister(c.info)
|
||||||
|
|
||||||
|
return c
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetInfo sets the static instance metadata.
|
||||||
|
func (c *Collector) SetInfo(hostname, tier, role, version string) {
|
||||||
|
c.info.WithLabelValues(hostname, tier, role, version).Set(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordDeploymentStart marks the start of a deployment.
|
||||||
|
func (c *Collector) RecordDeploymentStart() {
|
||||||
|
c.deploymentInProgress.Set(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordDeploymentEnd records the completion of a deployment.
|
||||||
|
func (c *Collector) RecordDeploymentEnd(action messages.Action, success bool, durationSeconds float64) {
|
||||||
|
c.deploymentInProgress.Set(0)
|
||||||
|
|
||||||
|
successLabel := "false"
|
||||||
|
if success {
|
||||||
|
successLabel = "true"
|
||||||
|
}
|
||||||
|
|
||||||
|
c.deploymentDuration.WithLabelValues(string(action), successLabel).Observe(durationSeconds)
|
||||||
|
|
||||||
|
status := "completed"
|
||||||
|
if !success {
|
||||||
|
status = "failed"
|
||||||
|
}
|
||||||
|
|
||||||
|
c.deploymentsTotal.WithLabelValues(status, string(action), "").Inc()
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordDeploymentFailure records a deployment failure with an error code.
|
||||||
|
func (c *Collector) RecordDeploymentFailure(action messages.Action, errorCode messages.ErrorCode, durationSeconds float64) {
|
||||||
|
c.deploymentInProgress.Set(0)
|
||||||
|
c.deploymentDuration.WithLabelValues(string(action), "false").Observe(durationSeconds)
|
||||||
|
c.deploymentsTotal.WithLabelValues("failed", string(action), string(errorCode)).Inc()
|
||||||
|
}
|
||||||
|
|
||||||
|
// RecordRejection records a rejected deployment request.
|
||||||
|
func (c *Collector) RecordRejection(action messages.Action, errorCode messages.ErrorCode) {
|
||||||
|
c.deploymentsTotal.WithLabelValues("rejected", string(action), string(errorCode)).Inc()
|
||||||
|
}
|
||||||
210
internal/metrics/metrics_test.go
Normal file
210
internal/metrics/metrics_test.go
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCollector_SetInfo(t *testing.T) {
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
c := NewCollector(reg)
|
||||||
|
|
||||||
|
c.SetInfo("testhost", "test", "web", "1.0.0")
|
||||||
|
|
||||||
|
expected := `
|
||||||
|
# HELP homelab_deploy_info Static instance metadata
|
||||||
|
# TYPE homelab_deploy_info gauge
|
||||||
|
homelab_deploy_info{hostname="testhost",role="web",tier="test",version="1.0.0"} 1
|
||||||
|
`
|
||||||
|
if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_info"); err != nil {
|
||||||
|
t.Errorf("unexpected metrics: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollector_RecordDeploymentStart(t *testing.T) {
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
c := NewCollector(reg)
|
||||||
|
|
||||||
|
c.RecordDeploymentStart()
|
||||||
|
|
||||||
|
expected := `
|
||||||
|
# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise
|
||||||
|
# TYPE homelab_deploy_deployment_in_progress gauge
|
||||||
|
homelab_deploy_deployment_in_progress 1
|
||||||
|
`
|
||||||
|
if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_deployment_in_progress"); err != nil {
|
||||||
|
t.Errorf("unexpected metrics: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollector_RecordDeploymentEnd_Success(t *testing.T) {
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
c := NewCollector(reg)
|
||||||
|
|
||||||
|
c.RecordDeploymentStart()
|
||||||
|
c.RecordDeploymentEnd(messages.ActionSwitch, true, 120.5)
|
||||||
|
|
||||||
|
// Check in_progress is 0
|
||||||
|
inProgressExpected := `
|
||||||
|
# HELP homelab_deploy_deployment_in_progress 1 if deployment running, 0 otherwise
|
||||||
|
# TYPE homelab_deploy_deployment_in_progress gauge
|
||||||
|
homelab_deploy_deployment_in_progress 0
|
||||||
|
`
|
||||||
|
if err := testutil.GatherAndCompare(reg, strings.NewReader(inProgressExpected), "homelab_deploy_deployment_in_progress"); err != nil {
|
||||||
|
t.Errorf("unexpected in_progress metrics: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check counter incremented
|
||||||
|
counterExpected := `
|
||||||
|
# HELP homelab_deploy_deployments_total Total deployment requests processed
|
||||||
|
# TYPE homelab_deploy_deployments_total counter
|
||||||
|
homelab_deploy_deployments_total{action="switch",error_code="",status="completed"} 1
|
||||||
|
`
|
||||||
|
if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil {
|
||||||
|
t.Errorf("unexpected counter metrics: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollector_RecordDeploymentEnd_Failure(t *testing.T) {
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
c := NewCollector(reg)
|
||||||
|
|
||||||
|
c.RecordDeploymentStart()
|
||||||
|
c.RecordDeploymentEnd(messages.ActionBoot, false, 60.0)
|
||||||
|
|
||||||
|
counterExpected := `
|
||||||
|
# HELP homelab_deploy_deployments_total Total deployment requests processed
|
||||||
|
# TYPE homelab_deploy_deployments_total counter
|
||||||
|
homelab_deploy_deployments_total{action="boot",error_code="",status="failed"} 1
|
||||||
|
`
|
||||||
|
if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil {
|
||||||
|
t.Errorf("unexpected counter metrics: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollector_RecordDeploymentFailure(t *testing.T) {
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
c := NewCollector(reg)
|
||||||
|
|
||||||
|
c.RecordDeploymentStart()
|
||||||
|
c.RecordDeploymentFailure(messages.ActionSwitch, messages.ErrorBuildFailed, 300.0)
|
||||||
|
|
||||||
|
counterExpected := `
|
||||||
|
# HELP homelab_deploy_deployments_total Total deployment requests processed
|
||||||
|
# TYPE homelab_deploy_deployments_total counter
|
||||||
|
homelab_deploy_deployments_total{action="switch",error_code="build_failed",status="failed"} 1
|
||||||
|
`
|
||||||
|
if err := testutil.GatherAndCompare(reg, strings.NewReader(counterExpected), "homelab_deploy_deployments_total"); err != nil {
|
||||||
|
t.Errorf("unexpected counter metrics: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCollector_RecordRejection(t *testing.T) {
|
||||||
|
reg := prometheus.NewRegistry()
|
||||||
|
c := NewCollector(reg)
|
||||||
|
|
||||||
|
c.RecordRejection(messages.ActionSwitch, messages.ErrorAlreadyRunning)
|
||||||
|
|
||||||
|
expected := `
|
||||||
|
# HELP homelab_deploy_deployments_total Total deployment requests processed
|
||||||
|
# TYPE homelab_deploy_deployments_total counter
|
||||||
|
homelab_deploy_deployments_total{action="switch",error_code="already_running",status="rejected"} 1
|
||||||
|
`
|
||||||
|
if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "homelab_deploy_deployments_total"); err != nil {
|
||||||
|
t.Errorf("unexpected metrics: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestServer_StartShutdown(t *testing.T) {
|
||||||
|
srv := NewServer(ServerConfig{
|
||||||
|
Addr: ":0", // Let OS pick a free port
|
||||||
|
})
|
||||||
|
|
||||||
|
if err := srv.Start(); err != nil {
|
||||||
|
t.Fatalf("failed to start server: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
if err := srv.Shutdown(ctx); err != nil {
|
||||||
|
t.Errorf("failed to shutdown server: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestServer_Endpoints(t *testing.T) {
|
||||||
|
srv := NewServer(ServerConfig{
|
||||||
|
Addr: "127.0.0.1:19972", // Use a fixed port for testing
|
||||||
|
})
|
||||||
|
|
||||||
|
if err := srv.Start(); err != nil {
|
||||||
|
t.Fatalf("failed to start server: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
_ = srv.Shutdown(ctx)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Give server time to start
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
|
||||||
|
t.Run("health endpoint", func(t *testing.T) {
|
||||||
|
resp, err := http.Get("http://127.0.0.1:19972/health")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to get health endpoint: %v", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = resp.Body.Close() }()
|
||||||
|
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
t.Errorf("expected status 200, got %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
if string(body) != "ok" {
|
||||||
|
t.Errorf("expected body 'ok', got %q", string(body))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("metrics endpoint", func(t *testing.T) {
|
||||||
|
// Set some info to have metrics to display
|
||||||
|
srv.Collector().SetInfo("testhost", "test", "web", "1.0.0")
|
||||||
|
|
||||||
|
resp, err := http.Get("http://127.0.0.1:19972/metrics")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to get metrics endpoint: %v", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = resp.Body.Close() }()
|
||||||
|
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
t.Errorf("expected status 200, got %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
bodyStr := string(body)
|
||||||
|
|
||||||
|
if !strings.Contains(bodyStr, "homelab_deploy_info") {
|
||||||
|
t.Error("expected metrics to contain homelab_deploy_info")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestServer_Collector(t *testing.T) {
|
||||||
|
srv := NewServer(ServerConfig{
|
||||||
|
Addr: ":0",
|
||||||
|
})
|
||||||
|
|
||||||
|
collector := srv.Collector()
|
||||||
|
if collector == nil {
|
||||||
|
t.Error("expected non-nil collector")
|
||||||
|
}
|
||||||
|
}
|
||||||
84
internal/metrics/server.go
Normal file
84
internal/metrics/server.go
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
|
||||||
|
|
||||||
|
// ServerConfig carries the settings accepted by NewServer.
type ServerConfig struct {
	// Addr is the listen address handed to the underlying http.Server.
	Addr string

	// Logger receives the server's log output. When nil, NewServer falls
	// back to slog.Default().
	Logger *slog.Logger
}
|
||||||
|
|
||||||
|
// Server serves Prometheus metrics over HTTP.
|
||||||
|
type Server struct {
|
||||||
|
httpServer *http.Server
|
||||||
|
registry *prometheus.Registry
|
||||||
|
collector *Collector
|
||||||
|
logger *slog.Logger
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewServer creates a new metrics server.
|
||||||
|
func NewServer(cfg ServerConfig) *Server {
|
||||||
|
logger := cfg.Logger
|
||||||
|
if logger == nil {
|
||||||
|
logger = slog.Default()
|
||||||
|
}
|
||||||
|
|
||||||
|
registry := prometheus.NewRegistry()
|
||||||
|
collector := NewCollector(registry)
|
||||||
|
|
||||||
|
mux := http.NewServeMux()
|
||||||
|
mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{
|
||||||
|
Registry: registry,
|
||||||
|
}))
|
||||||
|
mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte("ok"))
|
||||||
|
})
|
||||||
|
|
||||||
|
return &Server{
|
||||||
|
httpServer: &http.Server{
|
||||||
|
Addr: cfg.Addr,
|
||||||
|
Handler: mux,
|
||||||
|
ReadHeaderTimeout: 10 * time.Second,
|
||||||
|
},
|
||||||
|
registry: registry,
|
||||||
|
collector: collector,
|
||||||
|
logger: logger,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collector returns the metrics collector.
|
||||||
|
func (s *Server) Collector() *Collector {
|
||||||
|
return s.collector
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start starts the HTTP server in a goroutine.
|
||||||
|
func (s *Server) Start() error {
|
||||||
|
s.logger.Info("starting metrics server", "addr", s.httpServer.Addr)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
if err := s.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||||
|
s.logger.Error("metrics server error", "error", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shutdown gracefully shuts down the server.
|
||||||
|
func (s *Server) Shutdown(ctx context.Context) error {
|
||||||
|
s.logger.Info("shutting down metrics server")
|
||||||
|
if err := s.httpServer.Shutdown(ctx); err != nil {
|
||||||
|
return fmt.Errorf("failed to shutdown metrics server: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@@ -15,7 +15,18 @@ let
|
|||||||
"--discover-subject ${lib.escapeShellArg cfg.discoverSubject}"
|
"--discover-subject ${lib.escapeShellArg cfg.discoverSubject}"
|
||||||
]
|
]
|
||||||
++ lib.optional (cfg.role != null) "--role ${lib.escapeShellArg cfg.role}"
|
++ lib.optional (cfg.role != null) "--role ${lib.escapeShellArg cfg.role}"
|
||||||
++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects);
|
++ map (s: "--deploy-subject ${lib.escapeShellArg s}") cfg.deploySubjects
|
||||||
|
++ lib.optionals cfg.metrics.enable [
|
||||||
|
"--metrics-enabled"
|
||||||
|
"--metrics-addr ${lib.escapeShellArg cfg.metrics.address}"
|
||||||
|
]);
|
||||||
|
|
||||||
|
# Port component of the metrics address, used for the firewall rule.
# Taking the last ":"-separated field covers both ":9972" and "0.0.0.0:9972".
metricsPort = lib.toInt (lib.last (lib.splitString ":" cfg.metrics.address));
|
||||||
|
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
@@ -94,6 +105,23 @@ in
|
|||||||
description = "Additional environment variables for the service";
|
description = "Additional environment variables for the service";
|
||||||
example = { GIT_SSH_COMMAND = "ssh -i /run/secrets/deploy-key"; };
|
example = { GIT_SSH_COMMAND = "ssh -i /run/secrets/deploy-key"; };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
metrics = {
|
||||||
|
enable = lib.mkEnableOption "Prometheus metrics endpoint";
|
||||||
|
|
||||||
|
address = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = ":9972";
|
||||||
|
description = "Address for Prometheus metrics HTTP server";
|
||||||
|
example = "127.0.0.1:9972";
|
||||||
|
};
|
||||||
|
|
||||||
|
openFirewall = lib.mkOption {
|
||||||
|
type = lib.types.bool;
|
||||||
|
default = false;
|
||||||
|
description = "Open firewall for metrics port";
|
||||||
|
};
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
config = lib.mkIf cfg.enable {
|
config = lib.mkIf cfg.enable {
|
||||||
@@ -130,5 +158,9 @@ in
|
|||||||
# Following the approach of nixos auto-upgrade which has no hardening
|
# Following the approach of nixos auto-upgrade which has no hardening
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
networking.firewall.allowedTCPPorts = lib.mkIf (cfg.metrics.enable && cfg.metrics.openFirewall) [
|
||||||
|
metricsPort
|
||||||
|
];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user