feat: add Prometheus metrics to listener service

Add an optional Prometheus metrics HTTP endpoint to the listener for
monitoring deployment operations. Includes four metrics:

- homelab_deploy_deployments_total (counter with status/action/error_code)
- homelab_deploy_deployment_duration_seconds (histogram with action/success)
- homelab_deploy_deployment_in_progress (gauge)
- homelab_deploy_info (gauge with hostname/tier/role/version)

New CLI flags: --metrics-enabled, --metrics-addr (default :9972)
New NixOS options: metrics.enable, metrics.address, metrics.openFirewall

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-07 07:58:22 +01:00
parent 56365835c7
commit 79db119d1c
10 changed files with 613 additions and 9 deletions

View File

@@ -0,0 +1,98 @@
// Package metrics provides Prometheus metrics for the homelab-deploy listener.
package metrics
import (
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
"github.com/prometheus/client_golang/prometheus"
)
// Collector bundles the Prometheus metrics exposed by the listener.
// Construct it with NewCollector so that every metric is created and
// registered before use.
type Collector struct {
	// deploymentsTotal counts processed deployment requests, labelled by
	// status, action and error_code.
	deploymentsTotal *prometheus.CounterVec

	// deploymentDuration observes deployment execution time in seconds,
	// labelled by action and success.
	deploymentDuration *prometheus.HistogramVec

	// deploymentInProgress is set to 1 when a deployment starts and back to
	// 0 when it ends.
	deploymentInProgress prometheus.Gauge

	// info carries instance metadata as label values (hostname, tier, role,
	// version).
	info *prometheus.GaugeVec
}
// NewCollector builds the listener metrics and registers each of them with
// reg, in the order: deployments counter, duration histogram, in-progress
// gauge, info gauge. Registration uses MustRegister, so a registration
// failure results in a panic rather than a returned error.
func NewCollector(reg prometheus.Registerer) *Collector {
	// Bucket boundaries for typical NixOS build times, in seconds.
	durationBuckets := []float64{30, 60, 120, 300, 600, 900, 1200, 1800}

	total := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "homelab_deploy_deployments_total",
			Help: "Total deployment requests processed",
		},
		[]string{"status", "action", "error_code"},
	)

	duration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "homelab_deploy_deployment_duration_seconds",
			Help:    "Deployment execution time",
			Buckets: durationBuckets,
		},
		[]string{"action", "success"},
	)

	inProgress := prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "homelab_deploy_deployment_in_progress",
			Help: "1 if deployment running, 0 otherwise",
		},
	)

	info := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "homelab_deploy_info",
			Help: "Static instance metadata",
		},
		[]string{"hostname", "tier", "role", "version"},
	)

	// Register one metric per call, preserving the registration order.
	for _, metric := range []prometheus.Collector{total, duration, inProgress, info} {
		reg.MustRegister(metric)
	}

	return &Collector{
		deploymentsTotal:     total,
		deploymentDuration:   duration,
		deploymentInProgress: inProgress,
		info:                 info,
	}
}
// SetInfo publishes the instance metadata: the info gauge series identified
// by the given hostname, tier, role and version label values is set to 1.
func (c *Collector) SetInfo(hostname, tier, role, version string) {
	infoSeries := c.info.WithLabelValues(hostname, tier, role, version)
	infoSeries.Set(1)
}
// RecordDeploymentStart flags that a deployment is now running by setting
// the in-progress gauge to 1.
func (c *Collector) RecordDeploymentStart() {
	const running = 1
	c.deploymentInProgress.Set(running)
}
// RecordDeploymentEnd records a finished deployment.
//
// It resets the in-progress gauge to 0, observes durationSeconds in the
// duration histogram under (action, success), and increments the deployments
// counter with status "completed" when success is true or "failed"
// otherwise. The error_code label is always left empty here.
func (c *Collector) RecordDeploymentEnd(action messages.Action, success bool, durationSeconds float64) {
	c.deploymentInProgress.Set(0)

	// Derive both label values from the single success flag.
	status, successLabel := "failed", "false"
	if success {
		status, successLabel = "completed", "true"
	}

	actionLabel := string(action)
	c.deploymentDuration.WithLabelValues(actionLabel, successLabel).Observe(durationSeconds)
	c.deploymentsTotal.WithLabelValues(status, actionLabel, "").Inc()
}
// RecordDeploymentFailure records a deployment that failed with a known
// error code.
//
// It resets the in-progress gauge to 0, observes durationSeconds in the
// duration histogram with success="false", and increments the deployments
// counter with status "failed" and the given error code.
func (c *Collector) RecordDeploymentFailure(action messages.Action, errorCode messages.ErrorCode, durationSeconds float64) {
	actionLabel, codeLabel := string(action), string(errorCode)

	c.deploymentInProgress.Set(0)
	c.deploymentDuration.WithLabelValues(actionLabel, "false").Observe(durationSeconds)
	c.deploymentsTotal.WithLabelValues("failed", actionLabel, codeLabel).Inc()
}
// RecordRejection counts a deployment request that was rejected: the
// deployments counter is incremented with status "rejected" and the given
// action and error code. The in-progress gauge and the duration histogram
// are not touched.
func (c *Collector) RecordRejection(action messages.Action, errorCode messages.ErrorCode) {
	rejected := c.deploymentsTotal.WithLabelValues("rejected", string(action), string(errorCode))
	rejected.Inc()
}