Add a new info metric that exposes the current system's flake revision and the latest remote revision as labels. This makes it easier to see exactly which revision is deployed vs. which is available. Also add a version constant to the Go code and extract it in flake.nix, providing a single source of truth for the version. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
235 lines
5.7 KiB
Go
235 lines
5.7 KiB
Go
package collector
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
const (
	// nixosVersionPath is the file read to discover the version string of the
	// currently running system.
	nixosVersionPath = "/run/current-system/nixos-version"
)

var (
	// revisionPattern captures the git hash at the end of a nixos-version
	// string: 7 to 40 lowercase hex characters preceded by "." or "-".
	// Example inputs: "25.11.20260203.e576e3c" or "1994-294a625".
	revisionPattern = regexp.MustCompile(`[.-]([a-f0-9]{7,40})$`)
)
|
|
|
|
type FlakeCollector struct {
|
|
flakeURL string
|
|
checkInterval time.Duration
|
|
|
|
inputAge *prometheus.Desc
|
|
inputInfo *prometheus.Desc
|
|
flakeInfo *prometheus.Desc
|
|
revisionBehind *prometheus.Desc
|
|
|
|
mu sync.RWMutex
|
|
cachedData *flakeMetadata
|
|
lastFetch time.Time
|
|
fetchError error
|
|
}
|
|
|
|
type flakeMetadata struct {
|
|
Revision string `json:"revision"`
|
|
Locks flakeLocks `json:"locks"`
|
|
}
|
|
|
|
type flakeLocks struct {
|
|
Nodes map[string]flakeLockNode `json:"nodes"`
|
|
Root string `json:"root"`
|
|
}
|
|
|
|
type flakeLockNode struct {
|
|
Inputs map[string]interface{} `json:"inputs,omitempty"`
|
|
Locked *lockedInfo `json:"locked,omitempty"`
|
|
Original *originalInfo `json:"original,omitempty"`
|
|
}
|
|
|
|
// lockedInfo is the "locked" object of a lock node: the pinned state of one
// flake input.
type lockedInfo struct {
	// LastModified is subtracted from the current Unix time (seconds) to
	// compute the input's age.
	LastModified int64 `json:"lastModified"`
	// Rev is the locked revision; it is shortened to 7 characters when used
	// as a metric label.
	Rev string `json:"rev"`
	// Type is the input type, exported unchanged as a metric label.
	Type string `json:"type"`
}
|
|
|
|
// originalInfo is the "original" object of a lock node. Only its type is
// decoded.
type originalInfo struct {
	// Type is decoded from the "type" key.
	Type string `json:"type"`
}
|
|
|
|
func NewFlakeCollector(flakeURL string, checkInterval time.Duration) *FlakeCollector {
|
|
return &FlakeCollector{
|
|
flakeURL: flakeURL,
|
|
checkInterval: checkInterval,
|
|
inputAge: prometheus.NewDesc(
|
|
"nixos_flake_input_age_seconds",
|
|
"Age of flake input in seconds",
|
|
[]string{"input"}, nil,
|
|
),
|
|
inputInfo: prometheus.NewDesc(
|
|
"nixos_flake_input_info",
|
|
"Info gauge with revision and type labels",
|
|
[]string{"input", "rev", "type"}, nil,
|
|
),
|
|
flakeInfo: prometheus.NewDesc(
|
|
"nixos_flake_info",
|
|
"Info gauge with current and remote flake revisions",
|
|
[]string{"current_rev", "remote_rev"}, nil,
|
|
),
|
|
revisionBehind: prometheus.NewDesc(
|
|
"nixos_flake_revision_behind",
|
|
"1 if current system revision differs from remote latest, 0 if match",
|
|
nil, nil,
|
|
),
|
|
}
|
|
}
|
|
|
|
func (c *FlakeCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- c.inputAge
|
|
ch <- c.inputInfo
|
|
ch <- c.flakeInfo
|
|
ch <- c.revisionBehind
|
|
}
|
|
|
|
func (c *FlakeCollector) Collect(ch chan<- prometheus.Metric) {
|
|
data, err := c.getFlakeData()
|
|
if err != nil {
|
|
slog.Error("Failed to get flake data", "error", err)
|
|
return
|
|
}
|
|
|
|
c.collectInputMetrics(ch, data)
|
|
c.collectRevisionBehind(ch, data)
|
|
}
|
|
|
|
func (c *FlakeCollector) getFlakeData() (*flakeMetadata, error) {
|
|
c.mu.RLock()
|
|
if c.cachedData != nil && time.Since(c.lastFetch) < c.checkInterval {
|
|
data := c.cachedData
|
|
c.mu.RUnlock()
|
|
return data, nil
|
|
}
|
|
c.mu.RUnlock()
|
|
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
|
|
// Double-check after acquiring write lock
|
|
if c.cachedData != nil && time.Since(c.lastFetch) < c.checkInterval {
|
|
return c.cachedData, nil
|
|
}
|
|
|
|
data, err := fetchFlakeMetadata(c.flakeURL)
|
|
if err != nil {
|
|
c.fetchError = err
|
|
// Return cached data if available, even if stale
|
|
if c.cachedData != nil {
|
|
slog.Warn("Using stale flake data due to fetch error", "error", err)
|
|
return c.cachedData, nil
|
|
}
|
|
return nil, err
|
|
}
|
|
|
|
c.cachedData = data
|
|
c.lastFetch = time.Now()
|
|
c.fetchError = nil
|
|
return data, nil
|
|
}
|
|
|
|
func (c *FlakeCollector) collectInputMetrics(ch chan<- prometheus.Metric, data *flakeMetadata) {
|
|
now := time.Now().Unix()
|
|
|
|
for name, node := range data.Locks.Nodes {
|
|
// Skip the root node
|
|
if name == "root" {
|
|
continue
|
|
}
|
|
|
|
if node.Locked == nil {
|
|
continue
|
|
}
|
|
|
|
// Input age
|
|
age := float64(now - node.Locked.LastModified)
|
|
ch <- prometheus.MustNewConstMetric(c.inputAge, prometheus.GaugeValue, age, name)
|
|
|
|
// Input info
|
|
rev := node.Locked.Rev
|
|
if len(rev) > 7 {
|
|
rev = rev[:7]
|
|
}
|
|
inputType := node.Locked.Type
|
|
ch <- prometheus.MustNewConstMetric(c.inputInfo, prometheus.GaugeValue, 1, name, rev, inputType)
|
|
}
|
|
}
|
|
|
|
func (c *FlakeCollector) collectRevisionBehind(ch chan<- prometheus.Metric, data *flakeMetadata) {
|
|
currentRev, err := getCurrentSystemRevision()
|
|
if err != nil {
|
|
slog.Error("Failed to get current system revision", "error", err)
|
|
return
|
|
}
|
|
|
|
remoteRev := data.Revision
|
|
if len(remoteRev) > 7 {
|
|
remoteRev = remoteRev[:7]
|
|
}
|
|
|
|
// Emit flake info metric with revisions
|
|
ch <- prometheus.MustNewConstMetric(c.flakeInfo, prometheus.GaugeValue, 1, currentRev, remoteRev)
|
|
|
|
behind := 0.0
|
|
if currentRev != "" && data.Revision != "" {
|
|
if currentRev != remoteRev && !strings.HasPrefix(data.Revision, currentRev) {
|
|
behind = 1.0
|
|
}
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(c.revisionBehind, prometheus.GaugeValue, behind)
|
|
}
|
|
|
|
func fetchFlakeMetadata(flakeURL string) (*flakeMetadata, error) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "nix", "flake", "metadata", "--json", flakeURL)
|
|
output, err := cmd.Output()
|
|
if err != nil {
|
|
if exitErr, ok := err.(*exec.ExitError); ok {
|
|
return nil, fmt.Errorf("nix flake metadata failed: %s", strings.TrimSpace(string(exitErr.Stderr)))
|
|
}
|
|
return nil, fmt.Errorf("nix flake metadata failed: %w", err)
|
|
}
|
|
|
|
var data flakeMetadata
|
|
if err := json.Unmarshal(output, &data); err != nil {
|
|
return nil, fmt.Errorf("failed to parse flake metadata: %w", err)
|
|
}
|
|
|
|
return &data, nil
|
|
}
|
|
|
|
func getCurrentSystemRevision() (string, error) {
|
|
data, err := os.ReadFile(nixosVersionPath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
version := strings.TrimSpace(string(data))
|
|
matches := revisionPattern.FindStringSubmatch(version)
|
|
if matches == nil {
|
|
return "", nil
|
|
}
|
|
|
|
rev := matches[1]
|
|
if len(rev) > 7 {
|
|
rev = rev[:7]
|
|
}
|
|
return rev, nil
|
|
}
|