fix: wait for metrics scrape before restarting after switch deployment
After a successful switch deployment, the listener now waits for Prometheus to scrape the /metrics endpoint before exiting for restart. This ensures deployment metrics are captured before the process restarts and resets its in-memory counters. If no scrape occurs within 60 seconds, the listener logs a warning and proceeds with the restart anyway.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -16,7 +16,7 @@ import (
|
|||||||
"github.com/urfave/cli/v3"
|
"github.com/urfave/cli/v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
const version = "0.1.11"
|
const version = "0.1.13"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
app := &cli.Command{
|
app := &cli.Command{
|
||||||
|
|||||||
@@ -270,6 +270,17 @@ func (l *Listener) handleDeployRequest(subject string, data []byte) {
|
|||||||
|
|
||||||
// After a successful switch, signal restart so we pick up any new version
|
// After a successful switch, signal restart so we pick up any new version
|
||||||
if req.Action == messages.ActionSwitch {
|
if req.Action == messages.ActionSwitch {
|
||||||
|
// Wait for metrics scrape before restarting (if metrics enabled)
|
||||||
|
if l.metricsServer != nil {
|
||||||
|
l.logger.Info("waiting for metrics scrape before restart")
|
||||||
|
select {
|
||||||
|
case <-l.metricsServer.ScrapeCh():
|
||||||
|
l.logger.Info("metrics scraped, proceeding with restart")
|
||||||
|
case <-time.After(60 * time.Second):
|
||||||
|
l.logger.Warn("no metrics scrape within timeout, proceeding with restart anyway")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case l.restartCh <- struct{}{}:
|
case l.restartCh <- struct{}{}:
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ type Server struct {
|
|||||||
registry *prometheus.Registry
|
registry *prometheus.Registry
|
||||||
collector *Collector
|
collector *Collector
|
||||||
logger *slog.Logger
|
logger *slog.Logger
|
||||||
|
scrapeCh chan struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewServer creates a new metrics server.
|
// NewServer creates a new metrics server.
|
||||||
@@ -35,9 +36,20 @@ func NewServer(cfg ServerConfig) *Server {
|
|||||||
registry := prometheus.NewRegistry()
|
registry := prometheus.NewRegistry()
|
||||||
collector := NewCollector(registry)
|
collector := NewCollector(registry)
|
||||||
|
|
||||||
mux := http.NewServeMux()
|
scrapeCh := make(chan struct{})
|
||||||
mux.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{
|
|
||||||
|
metricsHandler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{
|
||||||
Registry: registry,
|
Registry: registry,
|
||||||
|
})
|
||||||
|
|
||||||
|
mux := http.NewServeMux()
|
||||||
|
mux.Handle("/metrics", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
metricsHandler.ServeHTTP(w, r)
|
||||||
|
// Signal that a scrape occurred (non-blocking)
|
||||||
|
select {
|
||||||
|
case scrapeCh <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
}))
|
}))
|
||||||
mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
|
mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
|
||||||
w.WriteHeader(http.StatusOK)
|
w.WriteHeader(http.StatusOK)
|
||||||
@@ -53,6 +65,7 @@ func NewServer(cfg ServerConfig) *Server {
|
|||||||
registry: registry,
|
registry: registry,
|
||||||
collector: collector,
|
collector: collector,
|
||||||
logger: logger,
|
logger: logger,
|
||||||
|
scrapeCh: scrapeCh,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -61,6 +74,11 @@ func (s *Server) Collector() *Collector {
|
|||||||
return s.collector
|
return s.collector
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ScrapeCh returns a receive-only channel that the /metrics handler signals after serving a scrape; the channel is unbuffered and the send is non-blocking, so a scrape is observed only by a receiver that is already waiting.
|
||||||
|
func (s *Server) ScrapeCh() <-chan struct{} {
|
||||||
|
return s.scrapeCh
|
||||||
|
}
|
||||||
|
|
||||||
// Start starts the HTTP server in a goroutine.
|
// Start starts the HTTP server in a goroutine.
|
||||||
func (s *Server) Start() error {
|
func (s *Server) Start() error {
|
||||||
s.logger.Info("starting metrics server", "addr", s.httpServer.Addr)
|
s.logger.Info("starting metrics server", "addr", s.httpServer.Addr)
|
||||||
|
|||||||
Reference in New Issue
Block a user