commit f637da487c4a824377d355a59491e0df37f255f8 Author: Torjus Håkestad Date: Fri Feb 6 22:50:14 2026 +0100 feat: implement nixos-exporter Prometheus exporter for NixOS-specific metrics including: - Generation collector: count, current, booted, age, config mismatch - Flake collector: input age, input info, revision behind Includes NixOS module, flake packaging, and documentation. Co-Authored-By: Claude Opus 4.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b2be92b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +result diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..6ac7fae --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,90 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +nixos-exporter is a Prometheus exporter for NixOS-specific metrics. It exposes system state information that standard exporters don't cover: generation management, flake input freshness, and upgrade status. + +**Status**: Implementation complete. + +## Build Commands + +Run commands through the Nix development shell using `nix develop -c`: + +```bash +# Build +nix develop -c go build ./... + +# Run tests +nix develop -c go test ./... + +# Run single test +nix develop -c go test -run TestName ./path/to/package + +# Lint +nix develop -c golangci-lint run + +# Vulnerability check +nix develop -c govulncheck ./... + +# Test Nix build +nix build + +# Run the binary (prefer this over go build + running binary) +# To pass arguments, use -- before them: nix run .#default -- --help +nix run .#default +``` + +## Testing Procedures + +Before committing, run the following checks: + +1. `nix develop -c go test ./...` - Unit tests +2. `nix develop -c golangci-lint run` - Linting +3. `nix develop -c govulncheck ./...` - Vulnerability scanning +4. 
`nix build` - Verify nix build works + +## Architecture + +Single binary with pluggable collectors: + +``` +nixos-exporter/ +├── main.go # Entry point, HTTP server on :9971 +├── collector/ +│ ├── generation.go # Core metrics (count, current, booted, age, mismatch) +│ └── flake.go # Flake input freshness metrics (optional) +└── config/ + └── config.go # CLI flag handling +``` + +### Collectors + +**Generation collector** (always enabled): Reads symlinks in `/nix/var/nix/profiles/`, `/run/current-system` and `/run/booted-system` to determine generation state. + +**Flake collector** (optional): Runs `nix flake metadata --json` on the configured flake URL to extract `lastModified` timestamps per input, and compares the remote revision with the hash in `/run/current-system/nixos-version`. + +### Configuration + +Supports CLI flags (`--listen`, `--collector.flake`, `--flake.url`, `--flake.check-interval`) and NixOS module integration via `services.prometheus.exporters.nixos`. + +## Commit Message Format + +Use conventional commit format: + +``` +feat: add new feature +fix: fix a bug +docs: update documentation +refactor: refactor code without changing behavior +test: add or update tests +chore: maintenance tasks +``` + +## Key Design Decisions + +- Go chosen for mature Prometheus client library and static binary output +- Port 9971 allocated for this exporter +- Follows nixpkgs `services.prometheus.exporters.*` module pattern +- No root required - only reads symlinks and files diff --git a/README.md b/README.md new file mode 100644 index 0000000..ba85057 --- /dev/null +++ b/README.md @@ -0,0 +1,142 @@ +# nixos-exporter + +A Prometheus exporter for NixOS-specific metrics. Exposes system state information that standard exporters don't cover: generation management, flake input freshness, and upgrade status. + +## Installation + +### As a flake + +```nix +{ + inputs.nixos-exporter.url = "git+https://git.t-juice.club/torjus/nixos-exporter"; + + outputs = { self, nixpkgs, nixos-exporter, ... 
}: { + nixosConfigurations.myhost = nixpkgs.lib.nixosSystem { + modules = [ + nixos-exporter.nixosModules.default + { + services.prometheus.exporters.nixos = { + enable = true; + flake = { + enable = true; + url = "github:myuser/myconfig"; + }; + }; + } + ]; + }; + }; +} +``` + +### Manual + +```bash +nix build +./result/bin/nixos-exporter --listen=:9971 +``` + +## CLI Flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--listen` | `:9971` | Address to listen on | +| `--collector.flake` | `false` | Enable flake collector | +| `--flake.url` | | Flake URL for revision comparison (required if flake collector enabled) | +| `--flake.check-interval` | `1h` | Interval between remote flake checks | + +## NixOS Module Options + +```nix +services.prometheus.exporters.nixos = { + enable = true; + port = 9971; + listenAddress = "0.0.0.0"; + openFirewall = false; + + flake = { + enable = false; + url = ""; # Required if flake.enable = true + checkInterval = "1h"; + }; +}; +``` + +## Metrics + +### Generation Metrics (always enabled) + +| Metric | Type | Description | +|--------|------|-------------| +| `nixos_generation_count` | Gauge | Total number of system generations | +| `nixos_current_generation` | Gauge | Currently active generation number | +| `nixos_booted_generation` | Gauge | Generation that was booted | +| `nixos_generation_age_seconds` | Gauge | Age of current generation in seconds | +| `nixos_config_mismatch` | Gauge | 1 if booted generation differs from current | + +### Flake Metrics (optional) + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `nixos_flake_input_age_seconds` | Gauge | `input` | Age of flake input in seconds | +| `nixos_flake_input_info` | Gauge | `input`, `rev`, `type` | Info gauge with revision and type labels | +| `nixos_flake_revision_behind` | Gauge | | 1 if current system revision differs from remote latest | + +## Example Prometheus Alerts + +```yaml +groups: + - name: nixos 
+ rules: + - alert: NixOSConfigStale + expr: nixos_generation_age_seconds > 7 * 24 * 3600 + for: 1h + labels: + severity: warning + annotations: + summary: "NixOS config on {{ $labels.instance }} is over 7 days old" + + - alert: NixOSRebootRequired + expr: nixos_config_mismatch == 1 + for: 24h + labels: + severity: info + annotations: + summary: "{{ $labels.instance }} needs reboot to apply config" + + - alert: NixpkgsInputStale + expr: nixos_flake_input_age_seconds{input="nixpkgs"} > 30 * 24 * 3600 + for: 1d + labels: + severity: info + annotations: + summary: "nixpkgs input on {{ $labels.instance }} is over 30 days old" + + - alert: NixOSRevisionBehind + expr: nixos_flake_revision_behind == 1 + for: 1h + labels: + severity: info + annotations: + summary: "{{ $labels.instance }} is behind remote flake revision" +``` + +## Security Considerations + +- The `/metrics` endpoint exposes system state and revision information. Only expose it on internal networks. +- Runs as non-root user; only reads symlinks and files that are world-readable. +- When using the flake collector, the exporter executes `nix flake metadata` to fetch remote data. + +## Known Limitations + +- The `nixos_flake_revision_behind` metric relies on parsing the git hash from `/run/current-system/nixos-version`. The format of this file varies depending on NixOS configuration: + - Standard format: `25.11.20260203.e576e3c` + - Custom format: `1994-294a625` + + If your system uses a non-standard format that doesn't end with a git hash, the revision comparison may not work correctly. + +- Flake input ages reflect the remote flake state. If the deployed system is behind, these will show newer timestamps than what's actually deployed. 
+ +## License + +MIT diff --git a/collector/flake.go b/collector/flake.go new file mode 100644 index 0000000..c026ad1 --- /dev/null +++ b/collector/flake.go @@ -0,0 +1,216 @@ +package collector + +import ( + "encoding/json" + "log/slog" + "os" + "os/exec" + "regexp" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +const nixosVersionPath = "/run/current-system/nixos-version" + +// revisionPattern extracts the git hash from nixos-version. +// Formats: "25.11.20260203.e576e3c" or "1994-294a625" +var revisionPattern = regexp.MustCompile(`[.-]([a-f0-9]{7,40})$`) + +type FlakeCollector struct { + flakeURL string + checkInterval time.Duration + + inputAge *prometheus.Desc + inputInfo *prometheus.Desc + revisionBehind *prometheus.Desc + + mu sync.RWMutex + cachedData *flakeMetadata + lastFetch time.Time + fetchError error +} + +type flakeMetadata struct { + Revision string `json:"revision"` + Locks flakeLocks `json:"locks"` +} + +type flakeLocks struct { + Nodes map[string]flakeLockNode `json:"nodes"` + Root string `json:"root"` +} + +type flakeLockNode struct { + Inputs map[string]interface{} `json:"inputs,omitempty"` + Locked *lockedInfo `json:"locked,omitempty"` + Original *originalInfo `json:"original,omitempty"` +} + +type lockedInfo struct { + LastModified int64 `json:"lastModified"` + Rev string `json:"rev"` + Type string `json:"type"` +} + +type originalInfo struct { + Type string `json:"type"` +} + +func NewFlakeCollector(flakeURL string, checkInterval time.Duration) *FlakeCollector { + return &FlakeCollector{ + flakeURL: flakeURL, + checkInterval: checkInterval, + inputAge: prometheus.NewDesc( + "nixos_flake_input_age_seconds", + "Age of flake input in seconds", + []string{"input"}, nil, + ), + inputInfo: prometheus.NewDesc( + "nixos_flake_input_info", + "Info gauge with revision and type labels", + []string{"input", "rev", "type"}, nil, + ), + revisionBehind: prometheus.NewDesc( + "nixos_flake_revision_behind", + "1 if 
current system revision differs from remote latest, 0 if match",
+			nil, nil,
+		),
+	}
+}
+
+// Describe sends the descriptors of all flake metrics to ch.
+func (c *FlakeCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.inputAge
+	ch <- c.inputInfo
+	ch <- c.revisionBehind
+}
+
+// Collect emits the per-input metrics and the revision-behind gauge.
+// Nothing is emitted when no flake data is available; the error is logged.
+func (c *FlakeCollector) Collect(ch chan<- prometheus.Metric) {
+	data, err := c.getFlakeData()
+	if err != nil {
+		slog.Error("Failed to get flake data", "error", err)
+		return
+	}
+
+	c.collectInputMetrics(ch, data)
+	c.collectRevisionBehind(ch, data)
+}
+
+// getFlakeData returns the flake metadata, serving the cached copy while it is
+// younger than checkInterval and calling fetchFlakeMetadata otherwise.
+//
+// When a fetch fails and a cached copy exists, the stale copy is returned and
+// the failed attempt restarts the interval, so a broken remote is retried once
+// per checkInterval rather than on every scrape. Without any cached copy the
+// error is returned and the next call tries again.
+func (c *FlakeCollector) getFlakeData() (*flakeMetadata, error) {
+	c.mu.RLock()
+	if c.cachedData != nil && time.Since(c.lastFetch) < c.checkInterval {
+		data := c.cachedData
+		c.mu.RUnlock()
+		return data, nil
+	}
+	c.mu.RUnlock()
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Double-check after acquiring write lock: another caller may have
+	// refreshed the cache while this one was waiting.
+	if c.cachedData != nil && time.Since(c.lastFetch) < c.checkInterval {
+		return c.cachedData, nil
+	}
+
+	data, err := fetchFlakeMetadata(c.flakeURL)
+	if err != nil {
+		c.fetchError = err
+		// Return cached data if available, even if stale
+		if c.cachedData != nil {
+			// Count the failed attempt as a check; otherwise every scrape
+			// would run the fetch again while holding the write lock.
+			c.lastFetch = time.Now()
+			slog.Warn("Using stale flake data due to fetch error", "error", err)
+			return c.cachedData, nil
+		}
+		return nil, err
+	}
+
+	c.cachedData = data
+	c.lastFetch = time.Now()
+	c.fetchError = nil
+	return data, nil
+}
+
+// collectInputMetrics emits the age and info gauges for every locked node in
+// data except the root node.
+func (c *FlakeCollector) collectInputMetrics(ch chan<- prometheus.Metric, data *flakeMetadata) {
+	now := time.Now().Unix()
+
+	// The lock data names its own root node; fall back to the conventional
+	// name when the field is absent.
+	rootName := data.Locks.Root
+	if rootName == "" {
+		rootName = "root"
+	}
+
+	for name, node := range data.Locks.Nodes {
+		if name == rootName || node.Locked == nil {
+			continue
+		}
+
+		// Input age. A missing lastModified decodes to 0; skip the age then
+		// instead of reporting the time elapsed since the Unix epoch.
+		if node.Locked.LastModified > 0 {
+			age := float64(now - node.Locked.LastModified)
+			ch <- prometheus.MustNewConstMetric(c.inputAge, prometheus.GaugeValue, age, name)
+		}
+
+		// Input info, with the revision shortened to at most 7 characters.
+		rev := node.Locked.Rev
+		if len(rev) > 7 {
+			rev = rev[:7]
+		}
+		inputType := node.Locked.Type
+		ch <- prometheus.MustNewConstMetric(c.inputInfo, prometheus.GaugeValue, 1, name, rev, inputType)
+	}
+}
+
+func (c *FlakeCollector) collectRevisionBehind(ch chan<-
prometheus.Metric, data *flakeMetadata) { + currentRev, err := getCurrentSystemRevision() + if err != nil { + slog.Error("Failed to get current system revision", "error", err) + return + } + + behind := 0.0 + if currentRev != "" && data.Revision != "" { + // Compare short hashes + remoteShort := data.Revision + if len(remoteShort) > 7 { + remoteShort = remoteShort[:7] + } + if currentRev != remoteShort && !strings.HasPrefix(data.Revision, currentRev) { + behind = 1.0 + } + } + + ch <- prometheus.MustNewConstMetric(c.revisionBehind, prometheus.GaugeValue, behind) +} + +func fetchFlakeMetadata(flakeURL string) (*flakeMetadata, error) { + cmd := exec.Command("nix", "flake", "metadata", "--json", flakeURL) + output, err := cmd.Output() + if err != nil { + return nil, err + } + + var data flakeMetadata + if err := json.Unmarshal(output, &data); err != nil { + return nil, err + } + + return &data, nil +} + +func getCurrentSystemRevision() (string, error) { + data, err := os.ReadFile(nixosVersionPath) + if err != nil { + return "", err + } + + version := strings.TrimSpace(string(data)) + matches := revisionPattern.FindStringSubmatch(version) + if matches == nil { + return "", nil + } + + rev := matches[1] + if len(rev) > 7 { + rev = rev[:7] + } + return rev, nil +} diff --git a/collector/flake_test.go b/collector/flake_test.go new file mode 100644 index 0000000..219dcc4 --- /dev/null +++ b/collector/flake_test.go @@ -0,0 +1,151 @@ +package collector + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" +) + +func TestRevisionPattern(t *testing.T) { + tests := []struct { + version string + wantRev string + }{ + {"25.11.20260203.e576e3c", "e576e3c"}, + {"1994-294a625", "294a625"}, + {"25.05.20250101.abcdef1234567890", "abcdef1234567890"}, + {"no-revision-here", ""}, + {"", ""}, + } + + for _, tt := range tests { + t.Run(tt.version, func(t *testing.T) { + matches := revisionPattern.FindStringSubmatch(tt.version) + var got string + if matches != nil { + got = 
matches[1] + } + if got != tt.wantRev { + t.Errorf("revisionPattern.FindStringSubmatch(%q) = %q, want %q", tt.version, got, tt.wantRev) + } + }) + } +} + +func TestGetCurrentSystemRevision(t *testing.T) { + // Skip if not on NixOS + if _, err := os.Stat(nixosVersionPath); os.IsNotExist(err) { + t.Skip("not running on NixOS") + } + + rev, err := getCurrentSystemRevision() + if err != nil { + t.Fatal(err) + } + + // Just check it returns something reasonable + t.Logf("current system revision: %s", rev) +} + +func TestGetCurrentSystemRevisionFromFile(t *testing.T) { + // Create a temp file to simulate /run/current-system/nixos-version + dir := t.TempDir() + versionPath := filepath.Join(dir, "nixos-version") + + tests := []struct { + content string + wantRev string + }{ + {"25.11.20260203.e576e3c\n", "e576e3c"}, + {"1994-294a625\n", "294a625"}, + {"25.05.20250101.abcdef1234567890\n", "abcdef1"}, + {"no-hash", ""}, + } + + for _, tt := range tests { + t.Run(tt.content, func(t *testing.T) { + if err := os.WriteFile(versionPath, []byte(tt.content), 0644); err != nil { + t.Fatal(err) + } + + // We can't easily test the actual function without modifying the constant, + // so we test the pattern extraction logic directly + version := tt.content + if len(version) > 0 && version[len(version)-1] == '\n' { + version = version[:len(version)-1] + } + + matches := revisionPattern.FindStringSubmatch(version) + var rev string + if matches != nil { + rev = matches[1] + if len(rev) > 7 { + rev = rev[:7] + } + } + + if rev != tt.wantRev { + t.Errorf("got revision %q, want %q", rev, tt.wantRev) + } + }) + } +} + +func TestFlakeLocksUnmarshal(t *testing.T) { + jsonData := `{ + "revision": "abc1234567890", + "locks": { + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1700000000, + "rev": "def4567890123", + "type": "github" + } + }, + "home-manager": { + "locked": { + "lastModified": 1699000000, + "rev": "ghi7890123456", + "type": "github" + } + }, + "root": { + "inputs": { + 
"nixpkgs": "nixpkgs", + "home-manager": "home-manager" + } + } + }, + "root": "root" + } + }` + + var data flakeMetadata + if err := json.Unmarshal([]byte(jsonData), &data); err != nil { + t.Fatal(err) + } + + if data.Revision != "abc1234567890" { + t.Errorf("expected revision abc1234567890, got %s", data.Revision) + } + + if len(data.Locks.Nodes) != 3 { + t.Errorf("expected 3 nodes, got %d", len(data.Locks.Nodes)) + } + + nixpkgs := data.Locks.Nodes["nixpkgs"] + if nixpkgs.Locked == nil { + t.Fatal("expected nixpkgs to have locked info") + } + if nixpkgs.Locked.LastModified != 1700000000 { + t.Errorf("expected lastModified 1700000000, got %d", nixpkgs.Locked.LastModified) + } + if nixpkgs.Locked.Rev != "def4567890123" { + t.Errorf("expected rev def4567890123, got %s", nixpkgs.Locked.Rev) + } + if nixpkgs.Locked.Type != "github" { + t.Errorf("expected type github, got %s", nixpkgs.Locked.Type) + } +} diff --git a/collector/generation.go b/collector/generation.go new file mode 100644 index 0000000..0e4f6ff --- /dev/null +++ b/collector/generation.go @@ -0,0 +1,218 @@ +package collector + +import ( + "log/slog" + "os" + "path/filepath" + "regexp" + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + profileDir = "/nix/var/nix/profiles" + currentSystemDir = "/run/current-system" + bootedSystemDir = "/run/booted-system" +) + +var generationPattern = regexp.MustCompile(`^system-(\d+)-link$`) + +type GenerationCollector struct { + generationCount *prometheus.Desc + currentGen *prometheus.Desc + bootedGen *prometheus.Desc + generationAge *prometheus.Desc + configMismatch *prometheus.Desc +} + +func NewGenerationCollector() *GenerationCollector { + return &GenerationCollector{ + generationCount: prometheus.NewDesc( + "nixos_generation_count", + "Total number of system generations", + nil, nil, + ), + currentGen: prometheus.NewDesc( + "nixos_current_generation", + "Currently active generation number", + nil, nil, + ), + bootedGen: 
prometheus.NewDesc( + "nixos_booted_generation", + "Generation that was booted", + nil, nil, + ), + generationAge: prometheus.NewDesc( + "nixos_generation_age_seconds", + "Age of current generation in seconds", + nil, nil, + ), + configMismatch: prometheus.NewDesc( + "nixos_config_mismatch", + "1 if booted generation differs from current", + nil, nil, + ), + } +} + +func (c *GenerationCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.generationCount + ch <- c.currentGen + ch <- c.bootedGen + ch <- c.generationAge + ch <- c.configMismatch +} + +func (c *GenerationCollector) Collect(ch chan<- prometheus.Metric) { + c.collectGenerationCount(ch) + c.collectCurrentGeneration(ch) + c.collectBootedGeneration(ch) + c.collectGenerationAge(ch) + c.collectConfigMismatch(ch) +} + +func (c *GenerationCollector) collectGenerationCount(ch chan<- prometheus.Metric) { + count, err := countGenerations(profileDir) + if err != nil { + slog.Error("Failed to count generations", "error", err) + return + } + ch <- prometheus.MustNewConstMetric(c.generationCount, prometheus.GaugeValue, float64(count)) +} + +func (c *GenerationCollector) collectCurrentGeneration(ch chan<- prometheus.Metric) { + gen, err := getCurrentGeneration(profileDir) + if err != nil { + slog.Error("Failed to get current generation", "error", err) + return + } + ch <- prometheus.MustNewConstMetric(c.currentGen, prometheus.GaugeValue, float64(gen)) +} + +func (c *GenerationCollector) collectBootedGeneration(ch chan<- prometheus.Metric) { + gen, err := getBootedGeneration(profileDir, bootedSystemDir) + if err != nil { + slog.Error("Failed to get booted generation", "error", err) + return + } + ch <- prometheus.MustNewConstMetric(c.bootedGen, prometheus.GaugeValue, float64(gen)) +} + +func (c *GenerationCollector) collectGenerationAge(ch chan<- prometheus.Metric) { + age, err := getGenerationAge(profileDir) + if err != nil { + slog.Error("Failed to get generation age", "error", err) + return + } + ch <- 
prometheus.MustNewConstMetric(c.generationAge, prometheus.GaugeValue, age) +} + +func (c *GenerationCollector) collectConfigMismatch(ch chan<- prometheus.Metric) { + mismatch, err := checkConfigMismatch(currentSystemDir, bootedSystemDir) + if err != nil { + slog.Error("Failed to check config mismatch", "error", err) + return + } + value := 0.0 + if mismatch { + value = 1.0 + } + ch <- prometheus.MustNewConstMetric(c.configMismatch, prometheus.GaugeValue, value) +} + +// countGenerations counts system-*-link entries in the profile directory. +func countGenerations(profileDir string) (int, error) { + entries, err := os.ReadDir(profileDir) + if err != nil { + return 0, err + } + + count := 0 + for _, entry := range entries { + if generationPattern.MatchString(entry.Name()) { + count++ + } + } + return count, nil +} + +// getCurrentGeneration parses the generation number from the system symlink. +func getCurrentGeneration(profileDir string) (int, error) { + systemLink := filepath.Join(profileDir, "system") + target, err := os.Readlink(systemLink) + if err != nil { + return 0, err + } + + // Target is relative like "system-123-link" + base := filepath.Base(target) + matches := generationPattern.FindStringSubmatch(base) + if matches == nil { + return 0, nil + } + + return strconv.Atoi(matches[1]) +} + +// getBootedGeneration finds the generation that matches /run/booted-system. 
+// When several generation links point at the booted store path, the highest
+// generation number is reported. The result is 0 with a nil error when no
+// generation link matches.
+func getBootedGeneration(profileDir, bootedSystemDir string) (int, error) {
+	bootedTarget, err := os.Readlink(bootedSystemDir)
+	if err != nil {
+		return 0, err
+	}
+
+	entries, err := os.ReadDir(profileDir)
+	if err != nil {
+		return 0, err
+	}
+
+	// os.ReadDir sorts by filename, so "system-10-link" comes before
+	// "system-9-link". Returning the first match would make the answer depend
+	// on that lexical order; keep the numerically highest match instead.
+	booted := 0
+	for _, entry := range entries {
+		matches := generationPattern.FindStringSubmatch(entry.Name())
+		if matches == nil {
+			continue
+		}
+
+		// Unreadable links are skipped rather than failing the whole lookup.
+		target, err := os.Readlink(filepath.Join(profileDir, entry.Name()))
+		if err != nil || target != bootedTarget {
+			continue
+		}
+
+		gen, err := strconv.Atoi(matches[1])
+		if err != nil {
+			return 0, err
+		}
+		if gen > booted {
+			booted = gen
+		}
+	}
+
+	return booted, nil
+}
+
+// getGenerationAge returns the age of the current system profile in seconds.
+// The age is measured from the modification time of the "system" symlink
+// itself, because Lstat does not follow the link.
+func getGenerationAge(profileDir string) (float64, error) {
+	systemLink := filepath.Join(profileDir, "system")
+	info, err := os.Lstat(systemLink)
+	if err != nil {
+		return 0, err
+	}
+
+	return time.Since(info.ModTime()).Seconds(), nil
+}
+
+// checkConfigMismatch compares /run/current-system and /run/booted-system targets.
+func checkConfigMismatch(currentSystemDir, bootedSystemDir string) (bool, error) { + currentTarget, err := os.Readlink(currentSystemDir) + if err != nil { + return false, err + } + + bootedTarget, err := os.Readlink(bootedSystemDir) + if err != nil { + return false, err + } + + return currentTarget != bootedTarget, nil +} diff --git a/collector/generation_test.go b/collector/generation_test.go new file mode 100644 index 0000000..b19c41e --- /dev/null +++ b/collector/generation_test.go @@ -0,0 +1,178 @@ +package collector + +import ( + "os" + "path/filepath" + "testing" +) + +func TestCountGenerations(t *testing.T) { + dir := t.TempDir() + + // Create some generation symlinks + for _, name := range []string{ + "system-1-link", + "system-2-link", + "system-10-link", + "system", // current system link, should not be counted + "other-file", // unrelated file + "system-x-link", // malformed, should not be counted + } { + path := filepath.Join(dir, name) + if err := os.Symlink("/nix/store/dummy", path); err != nil { + t.Fatal(err) + } + } + + count, err := countGenerations(dir) + if err != nil { + t.Fatal(err) + } + + if count != 3 { + t.Errorf("expected 3 generations, got %d", count) + } +} + +func TestGetCurrentGeneration(t *testing.T) { + dir := t.TempDir() + + // Create system symlink pointing to a generation + if err := os.Symlink("system-42-link", filepath.Join(dir, "system")); err != nil { + t.Fatal(err) + } + + gen, err := getCurrentGeneration(dir) + if err != nil { + t.Fatal(err) + } + + if gen != 42 { + t.Errorf("expected generation 42, got %d", gen) + } +} + +func TestGetBootedGeneration(t *testing.T) { + profileDir := t.TempDir() + bootedDir := t.TempDir() + + storePath := "/nix/store/abc123-nixos-system" + + // Create generation symlinks + if err := os.Symlink("/nix/store/other", filepath.Join(profileDir, "system-1-link")); err != nil { + t.Fatal(err) + } + if err := os.Symlink(storePath, filepath.Join(profileDir, "system-2-link")); err != nil { + 
t.Fatal(err) + } + if err := os.Symlink("/nix/store/another", filepath.Join(profileDir, "system-3-link")); err != nil { + t.Fatal(err) + } + + // Create booted-system symlink + bootedSystemPath := filepath.Join(bootedDir, "booted-system") + if err := os.Symlink(storePath, bootedSystemPath); err != nil { + t.Fatal(err) + } + + gen, err := getBootedGeneration(profileDir, bootedSystemPath) + if err != nil { + t.Fatal(err) + } + + if gen != 2 { + t.Errorf("expected booted generation 2, got %d", gen) + } +} + +func TestCheckConfigMismatch(t *testing.T) { + dir := t.TempDir() + + currentPath := filepath.Join(dir, "current-system") + bootedPath := filepath.Join(dir, "booted-system") + + // Same target = no mismatch + if err := os.Symlink("/nix/store/same", currentPath); err != nil { + t.Fatal(err) + } + if err := os.Symlink("/nix/store/same", bootedPath); err != nil { + t.Fatal(err) + } + + mismatch, err := checkConfigMismatch(currentPath, bootedPath) + if err != nil { + t.Fatal(err) + } + if mismatch { + t.Error("expected no mismatch when targets are the same") + } + + // Different targets = mismatch + if err := os.Remove(currentPath); err != nil { + t.Fatal(err) + } + if err := os.Symlink("/nix/store/different", currentPath); err != nil { + t.Fatal(err) + } + + mismatch, err = checkConfigMismatch(currentPath, bootedPath) + if err != nil { + t.Fatal(err) + } + if !mismatch { + t.Error("expected mismatch when targets differ") + } +} + +func TestGetGenerationAge(t *testing.T) { + dir := t.TempDir() + + // Create system symlink + if err := os.Symlink("system-1-link", filepath.Join(dir, "system")); err != nil { + t.Fatal(err) + } + + age, err := getGenerationAge(dir) + if err != nil { + t.Fatal(err) + } + + // Age should be very small since we just created it + if age < 0 || age > 1 { + t.Errorf("expected age close to 0, got %f", age) + } +} + +func TestGenerationPattern(t *testing.T) { + tests := []struct { + name string + match bool + genNum string + }{ + {"system-1-link", 
true, "1"}, + {"system-42-link", true, "42"}, + {"system-123-link", true, "123"}, + {"system", false, ""}, + {"system-link", false, ""}, + {"system--link", false, ""}, + {"system-abc-link", false, ""}, + {"other-1-link", false, ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + matches := generationPattern.FindStringSubmatch(tt.name) + if tt.match { + if matches == nil { + t.Errorf("expected %q to match", tt.name) + } else if matches[1] != tt.genNum { + t.Errorf("expected generation %q, got %q", tt.genNum, matches[1]) + } + } else { + if matches != nil { + t.Errorf("expected %q not to match", tt.name) + } + } + }) + } +} diff --git a/config/config.go b/config/config.go new file mode 100644 index 0000000..9d357a5 --- /dev/null +++ b/config/config.go @@ -0,0 +1,31 @@ +package config + +import ( + "flag" + "fmt" + "time" +) + +type Config struct { + ListenAddr string + FlakeCollector bool + FlakeURL string + FlakeCheckInterval time.Duration +} + +func Parse() (*Config, error) { + cfg := &Config{} + + flag.StringVar(&cfg.ListenAddr, "listen", ":9971", "Address to listen on") + flag.BoolVar(&cfg.FlakeCollector, "collector.flake", false, "Enable flake collector") + flag.StringVar(&cfg.FlakeURL, "flake.url", "", "Flake URL for revision comparison (required if flake collector enabled)") + flag.DurationVar(&cfg.FlakeCheckInterval, "flake.check-interval", time.Hour, "Interval between remote flake checks") + + flag.Parse() + + if cfg.FlakeCollector && cfg.FlakeURL == "" { + return nil, fmt.Errorf("--flake.url is required when --collector.flake is enabled") + } + + return cfg, nil +} diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..f75f263 --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1770197578, + "narHash": "sha256-AYqlWrX09+HvGs8zM6ebZ1pwUqjkfpnv8mewYwAo+iM=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "00c21e4c93d963c50d4c0c89bfa84ed6e0694df2", + 
"type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..864fcdf --- /dev/null +++ b/flake.nix @@ -0,0 +1,53 @@ +{ + description = "Prometheus exporter for NixOS-specific metrics"; + + inputs = { + nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable"; + }; + + outputs = { self, nixpkgs }: + let + supportedSystems = [ "x86_64-linux" "aarch64-linux" ]; + forAllSystems = nixpkgs.lib.genAttrs supportedSystems; + in + { + packages = forAllSystems (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + in + { + default = pkgs.buildGoModule { + pname = "nixos-exporter"; + version = "0.1.0"; + src = ./.; + vendorHash = "sha256-NnvB20rORPS5QF5enbb5KpWaKZ70ybSgfd7wjk21/Cg="; + + meta = with pkgs.lib; { + description = "Prometheus exporter for NixOS-specific metrics"; + homepage = "https://git.t-juice.club/torjus/nixos-exporter"; + license = licenses.mit; + maintainers = [ ]; + platforms = platforms.linux; + }; + }; + }); + + devShells = forAllSystems (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + in + { + default = pkgs.mkShell { + buildInputs = with pkgs; [ + go + gopls + golangci-lint + govulncheck + delve + ]; + }; + }); + + nixosModules.default = import ./module.nix { inherit self; }; + }; +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c29452b --- /dev/null +++ b/go.mod @@ -0,0 +1,17 @@ +module git.t-juice.club/torjus/nixos-exporter + +go 1.23 + +require github.com/prometheus/client_golang v1.20.5 + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/klauspost/compress v1.17.9 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + 
github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect + golang.org/x/sys v0.22.0 // indirect + google.golang.org/protobuf v1.34.2 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..d5318cf --- /dev/null +++ b/go.sum @@ -0,0 +1,24 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= 
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
+golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
+golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
+google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..a692b9e
--- /dev/null
+++ b/main.go
@@ -0,0 +1,84 @@
+package main
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"git.t-juice.club/torjus/nixos-exporter/collector"
+	"git.t-juice.club/torjus/nixos-exporter/config"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+)
+
+// main parses the command-line configuration, registers the collectors with
+// the default Prometheus registry and serves /metrics plus an HTML landing
+// page. On SIGINT or SIGTERM the server is shut down with a five second
+// grace period.
+func main() {
+	cfg, err := config.Parse()
+	if err != nil {
+		slog.Error("Failed to parse config", "error", err)
+		os.Exit(1)
+	}
+
+	// Register generation collector
+	genCollector := collector.NewGenerationCollector()
+	prometheus.MustRegister(genCollector)
+	slog.Info("Registered generation collector")
+
+	// Register flake collector if enabled
+	if cfg.FlakeCollector {
+		flakeCollector := collector.NewFlakeCollector(cfg.FlakeURL, cfg.FlakeCheckInterval)
+		prometheus.MustRegister(flakeCollector)
+		slog.Info("Registered flake collector", "url", cfg.FlakeURL, "check_interval", cfg.FlakeCheckInterval)
+	}
+
+	mux := http.NewServeMux()
+	mux.Handle("/metrics", promhttp.Handler())
+	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		// Declare the type explicitly instead of relying on content sniffing.
+		w.Header().Set("Content-Type", "text/html; charset=utf-8")
+		_, _ = w.Write([]byte(`<html>
+<head><title>NixOS Exporter</title></head>
+<body>
+<h1>NixOS Exporter</h1>
+<p><a href="/metrics">Metrics</a></p>
+</body>
+</html>`))
+	})
+
+	server := &http.Server{
+		Addr:    cfg.ListenAddr,
+		Handler: mux,
+		// Bound the time a client may take to send its request headers so
+		// that stalled connections cannot hold sockets open indefinitely.
+		ReadHeaderTimeout: 10 * time.Second,
+	}
+
+	// Handle shutdown gracefully
+	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
+	defer stop()
+
+	go func() {
+		slog.Info("Starting server", "addr", cfg.ListenAddr)
+		if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			slog.Error("Server error", "error", err)
+			os.Exit(1)
+		}
+	}()
+
+	<-ctx.Done()
+	slog.Info("Shutting down server")
+
+	shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	if err := server.Shutdown(shutdownCtx); err != nil {
+		slog.Error("Failed to shutdown server", "error", err)
+	}
+}
diff --git a/module.nix b/module.nix
new file mode 100644
index 0000000..5ef4d05
--- /dev/null
+++ b/module.nix
@@ -0,0 +1,122 @@
+{ self }:
+{ config, lib, pkgs, ... }:
+
+let
+  cfg = config.services.prometheus.exporters.nixos;
+in
+{
+  options.services.prometheus.exporters.nixos = {
+    enable = lib.mkEnableOption "NixOS Prometheus exporter";
+
+    port = lib.mkOption {
+      type = lib.types.port;
+      default = 9971;
+      description = "Port to listen on.";
+    };
+
+    listenAddress = lib.mkOption {
+      type = lib.types.str;
+      default = "0.0.0.0";
+      description = "Address to listen on.";
+    };
+
+    flake = {
+      enable = lib.mkEnableOption "flake collector";
+
+      url = lib.mkOption {
+        type = lib.types.str;
+        default = "";
+        description = ''
+          Flake URL for revision comparison.
+          Required if flake collector is enabled.
+ ''; + }; + + checkInterval = lib.mkOption { + type = lib.types.str; + default = "1h"; + description = "Interval between remote flake checks."; + }; + }; + + openFirewall = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Open the firewall for the exporter port."; + }; + + user = lib.mkOption { + type = lib.types.str; + default = "nixos-exporter"; + description = "User to run the exporter as."; + }; + + group = lib.mkOption { + type = lib.types.str; + default = "nixos-exporter"; + description = "Group to run the exporter as."; + }; + + package = lib.mkOption { + type = lib.types.package; + default = self.packages.${pkgs.system}.default; + description = "The nixos-exporter package to use."; + }; + }; + + config = lib.mkIf cfg.enable { + assertions = [ + { + assertion = cfg.flake.enable -> cfg.flake.url != ""; + message = "services.prometheus.exporters.nixos.flake.url must be set when flake collector is enabled"; + } + ]; + + users.users.${cfg.user} = { + isSystemUser = true; + group = cfg.group; + description = "NixOS exporter user"; + }; + + users.groups.${cfg.group} = { }; + + systemd.services.prometheus-nixos-exporter = { + description = "Prometheus NixOS Exporter"; + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + + serviceConfig = { + User = cfg.user; + Group = cfg.group; + ExecStart = lib.concatStringsSep " " ([ + "${cfg.package}/bin/nixos-exporter" + "--listen=${cfg.listenAddress}:${toString cfg.port}" + ] ++ lib.optionals cfg.flake.enable [ + "--collector.flake" + "--flake.url=${cfg.flake.url}" + "--flake.check-interval=${cfg.flake.checkInterval}" + ]); + Restart = "on-failure"; + RestartSec = "5s"; + + # Hardening + NoNewPrivileges = true; + ProtectSystem = "strict"; + ProtectHome = true; + PrivateTmp = true; + PrivateDevices = true; + ProtectKernelTunables = true; + ProtectKernelModules = true; + ProtectControlGroups = true; + RestrictAddressFamilies = [ "AF_INET" "AF_INET6" ]; + RestrictNamespaces = true; 
+ RestrictRealtime = true; + RestrictSUIDSGID = true; + MemoryDenyWriteExecute = true; + LockPersonality = true; + }; + }; + + networking.firewall.allowedTCPPorts = lib.mkIf cfg.openFirewall [ cfg.port ]; + }; +}