Counter and histogram metrics were absent from Prometheus scrapes until the first deployment occurred, making it impossible to distinguish "no deployments" from "exporter not running" in dashboards and alerts. Initialize all expected label combinations with zero values when the collector is created so metrics appear in every scrape from startup. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
126 lines
4.0 KiB
Go
126 lines
4.0 KiB
Go
// Package metrics provides Prometheus metrics for the homelab-deploy listener.
|
|
package metrics
|
|
|
|
import (
|
|
"git.t-juice.club/torjus/homelab-deploy/internal/messages"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
// Collector holds all Prometheus metrics for the listener.
|
|
type Collector struct {
|
|
deploymentsTotal *prometheus.CounterVec
|
|
deploymentDuration *prometheus.HistogramVec
|
|
deploymentInProgress prometheus.Gauge
|
|
info *prometheus.GaugeVec
|
|
}
|
|
|
|
// NewCollector creates a new metrics collector and registers it with the given registerer.
|
|
func NewCollector(reg prometheus.Registerer) *Collector {
|
|
c := &Collector{
|
|
deploymentsTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "homelab_deploy_deployments_total",
|
|
Help: "Total deployment requests processed",
|
|
},
|
|
[]string{"status", "action", "error_code"},
|
|
),
|
|
deploymentDuration: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "homelab_deploy_deployment_duration_seconds",
|
|
Help: "Deployment execution time",
|
|
// Bucket boundaries for typical NixOS build times
|
|
Buckets: []float64{30, 60, 120, 300, 600, 900, 1200, 1800},
|
|
},
|
|
[]string{"action", "success"},
|
|
),
|
|
deploymentInProgress: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Name: "homelab_deploy_deployment_in_progress",
|
|
Help: "1 if deployment running, 0 otherwise",
|
|
},
|
|
),
|
|
info: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "homelab_deploy_info",
|
|
Help: "Static instance metadata",
|
|
},
|
|
[]string{"hostname", "tier", "role", "version"},
|
|
),
|
|
}
|
|
|
|
reg.MustRegister(c.deploymentsTotal)
|
|
reg.MustRegister(c.deploymentDuration)
|
|
reg.MustRegister(c.deploymentInProgress)
|
|
reg.MustRegister(c.info)
|
|
|
|
c.initMetrics()
|
|
|
|
return c
|
|
}
|
|
|
|
// initMetrics initializes all metric label combinations with zero values.
|
|
// This ensures metrics appear in Prometheus scrapes before any deployments occur.
|
|
func (c *Collector) initMetrics() {
|
|
actions := []messages.Action{
|
|
messages.ActionSwitch,
|
|
messages.ActionBoot,
|
|
messages.ActionTest,
|
|
messages.ActionDryActivate,
|
|
}
|
|
|
|
// Initialize deployment counter for common status/action combinations
|
|
for _, action := range actions {
|
|
// Successful completions (no error code)
|
|
c.deploymentsTotal.WithLabelValues("completed", string(action), "")
|
|
// Failed deployments (no error code - from RecordDeploymentEnd)
|
|
c.deploymentsTotal.WithLabelValues("failed", string(action), "")
|
|
}
|
|
|
|
// Initialize histogram for all action/success combinations
|
|
for _, action := range actions {
|
|
c.deploymentDuration.WithLabelValues(string(action), "true")
|
|
c.deploymentDuration.WithLabelValues(string(action), "false")
|
|
}
|
|
}
|
|
|
|
// SetInfo sets the static instance metadata.
|
|
func (c *Collector) SetInfo(hostname, tier, role, version string) {
|
|
c.info.WithLabelValues(hostname, tier, role, version).Set(1)
|
|
}
|
|
|
|
// RecordDeploymentStart marks the start of a deployment.
|
|
func (c *Collector) RecordDeploymentStart() {
|
|
c.deploymentInProgress.Set(1)
|
|
}
|
|
|
|
// RecordDeploymentEnd records the completion of a deployment.
|
|
func (c *Collector) RecordDeploymentEnd(action messages.Action, success bool, durationSeconds float64) {
|
|
c.deploymentInProgress.Set(0)
|
|
|
|
successLabel := "false"
|
|
if success {
|
|
successLabel = "true"
|
|
}
|
|
|
|
c.deploymentDuration.WithLabelValues(string(action), successLabel).Observe(durationSeconds)
|
|
|
|
status := "completed"
|
|
if !success {
|
|
status = "failed"
|
|
}
|
|
|
|
c.deploymentsTotal.WithLabelValues(status, string(action), "").Inc()
|
|
}
|
|
|
|
// RecordDeploymentFailure records a deployment failure with an error code.
|
|
func (c *Collector) RecordDeploymentFailure(action messages.Action, errorCode messages.ErrorCode, durationSeconds float64) {
|
|
c.deploymentInProgress.Set(0)
|
|
c.deploymentDuration.WithLabelValues(string(action), "false").Observe(durationSeconds)
|
|
c.deploymentsTotal.WithLabelValues("failed", string(action), string(errorCode)).Inc()
|
|
}
|
|
|
|
// RecordRejection records a rejected deployment request.
|
|
func (c *Collector) RecordRejection(action messages.Action, errorCode messages.ErrorCode) {
|
|
c.deploymentsTotal.WithLabelValues("rejected", string(action), string(errorCode)).Inc()
|
|
}
|