From bc02393c5a051ce3a065aed8cc365c1378a6572f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Sun, 8 Feb 2026 23:38:26 +0100 Subject: [PATCH] fix: wait for metrics scrape before restarting after switch deployment After a successful switch deployment, the listener now waits for Prometheus to scrape the /metrics endpoint before exiting for restart. This ensures deployment metrics are captured before the process restarts and resets in-memory counters. Falls back to a 60 second timeout if no scrape occurs. Co-Authored-By: Claude Opus 4.5 --- cmd/homelab-deploy/main.go | 2 +- internal/listener/listener.go | 11 +++++++++++ internal/metrics/server.go | 22 ++++++++++++++++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/cmd/homelab-deploy/main.go b/cmd/homelab-deploy/main.go index 837014a..4c5bdde 100644 --- a/cmd/homelab-deploy/main.go +++ b/cmd/homelab-deploy/main.go @@ -16,7 +16,7 @@ import ( "github.com/urfave/cli/v3" ) -const version = "0.1.11" +const version = "0.1.13" func main() { app := &cli.Command{ diff --git a/internal/listener/listener.go b/internal/listener/listener.go index b30357f..70b9ee3 100644 --- a/internal/listener/listener.go +++ b/internal/listener/listener.go @@ -270,6 +270,17 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) { // After a successful switch, signal restart so we pick up any new version if req.Action == messages.ActionSwitch { + // Wait for metrics scrape before restarting (if metrics enabled) + if l.metricsServer != nil { + l.logger.Info("waiting for metrics scrape before restart") + select { + case <-l.metricsServer.ScrapeCh(): + l.logger.Info("metrics scraped, proceeding with restart") + case <-time.After(60 * time.Second): + l.logger.Warn("no metrics scrape within timeout, proceeding with restart anyway") + } + } + select { case l.restartCh <- struct{}{}: default: diff --git a/internal/metrics/server.go b/internal/metrics/server.go index da6ebb8..886d463 100644 --- 
a/internal/metrics/server.go +++ b/internal/metrics/server.go @@ -23,6 +23,7 @@ type Server struct { registry *prometheus.Registry collector *Collector logger *slog.Logger + scrapeCh chan struct{} } // NewServer creates a new metrics server. @@ -35,9 +36,20 @@ func NewServer(cfg ServerConfig) *Server { registry := prometheus.NewRegistry() collector := NewCollector(registry) - mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{ + scrapeCh := make(chan struct{}) + + metricsHandler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{ Registry: registry, + }) + + mux := http.NewServeMux() + mux.Handle("/metrics", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + metricsHandler.ServeHTTP(w, r) + // Signal that a scrape occurred (non-blocking) + select { + case scrapeCh <- struct{}{}: + default: + } })) mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) @@ -53,6 +65,7 @@ func NewServer(cfg ServerConfig) *Server { registry: registry, collector: collector, logger: logger, + scrapeCh: scrapeCh, } } @@ -61,6 +74,11 @@ func (s *Server) Collector() *Collector { return s.collector } +// ScrapeCh returns a channel that is signaled after a scrape of the metrics endpoint completes; the send is non-blocking on an unbuffered channel, so a scrape is dropped unless a receiver is already waiting. +func (s *Server) ScrapeCh() <-chan struct{} { + return s.scrapeCh +} + // Start starts the HTTP server in a goroutine. func (s *Server) Start() error { s.logger.Info("starting metrics server", "addr", s.httpServer.Addr)