From d485948df02f7e0181358902578b8dcb28302d2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torjus=20H=C3=A5kestad?= Date: Fri, 13 Feb 2026 23:43:47 +0100 Subject: [PATCH] docs: update Loki queries from host to hostname label Update all LogQL examples, agent instructions, and scripts to use the hostname label instead of host, matching the Prometheus label naming convention. Also update pipe-to-loki and bootstrap scripts to push hostname instead of host. Co-Authored-By: Claude Opus 4.6 --- .claude/agents/auditor.md | 20 ++++++------ .claude/agents/investigate-alarm.md | 10 +++--- .claude/skills/observability/SKILL.md | 44 ++++++++++++++++++--------- CLAUDE.md | 2 +- docs/host-creation.md | 12 ++++---- docs/plans/loki-improvements.md | 18 +++++------ hosts/template2/bootstrap.nix | 2 +- system/pipe-to-loki.nix | 2 +- 8 files changed, 63 insertions(+), 47 deletions(-) diff --git a/.claude/agents/auditor.md b/.claude/agents/auditor.md index de12e51..12b253a 100644 --- a/.claude/agents/auditor.md +++ b/.claude/agents/auditor.md @@ -19,7 +19,7 @@ You may receive: ## Audit Log Structure Logs are shipped to Loki via promtail. 
Audit events use these labels: -- `host` - hostname +- `hostname` - hostname - `systemd_unit` - typically `auditd.service` for audit logs - `job` - typically `systemd-journal` @@ -36,7 +36,7 @@ Audit log entries contain structured data: Find SSH logins and session activity: ```logql -{host="", systemd_unit="sshd.service"} +{hostname="", systemd_unit="sshd.service"} ``` Look for: @@ -48,7 +48,7 @@ Look for: Query executed commands (filter out noise): ```logql -{host=""} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF" +{hostname=""} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF" ``` Further filtering: @@ -60,28 +60,28 @@ Further filtering: Check for privilege escalation: ```logql -{host=""} |= "sudo" |= "COMMAND" +{hostname=""} |= "sudo" |= "COMMAND" ``` Or via audit: ```logql -{host=""} |= "USER_CMD" +{hostname=""} |= "USER_CMD" ``` ### 4. Service Manipulation Check if services were manually stopped/started: ```logql -{host=""} |= "EXECVE" |= "systemctl" +{hostname=""} |= "EXECVE" |= "systemctl" ``` ### 5. File Operations Look for file modifications (if auditd rules are configured): ```logql -{host=""} |= "EXECVE" |= "vim" -{host=""} |= "EXECVE" |= "nano" -{host=""} |= "EXECVE" |= "rm" +{hostname=""} |= "EXECVE" |= "vim" +{hostname=""} |= "EXECVE" |= "nano" +{hostname=""} |= "EXECVE" |= "rm" ``` ## Query Guidelines @@ -99,7 +99,7 @@ Look for file modifications (if auditd rules are configured): **Time-bounded queries:** When investigating around a specific event: ```logql -{host=""} |= "EXECVE" != "systemd" +{hostname=""} |= "EXECVE" != "systemd" ``` With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"` diff --git a/.claude/agents/investigate-alarm.md b/.claude/agents/investigate-alarm.md index e4607ff..11462ea 100644 --- a/.claude/agents/investigate-alarm.md +++ b/.claude/agents/investigate-alarm.md @@ -41,13 +41,13 @@ Search for relevant log entries using `query_logs`. 
Focus on service-specific lo **Query strategies (start narrow, expand if needed):** - Start with `limit: 20-30`, increase only if needed - Use tight time windows: `start: "15m"` or `start: "30m"` initially -- Filter to specific services: `{host="", systemd_unit=".service"}` -- Search for errors: `{host=""} |= "error"` or `|= "failed"` +- Filter to specific services: `{hostname="", systemd_unit=".service"}` +- Search for errors: `{hostname=""} |= "error"` or `|= "failed"` **Common patterns:** -- Service logs: `{host="", systemd_unit=".service"}` -- All errors on host: `{host=""} |= "error"` -- Journal for a unit: `{host="", systemd_unit="nginx.service"} |= "failed"` +- Service logs: `{hostname="", systemd_unit=".service"}` +- All errors on host: `{hostname=""} |= "error"` +- Journal for a unit: `{hostname="", systemd_unit="nginx.service"} |= "failed"` **Avoid:** - Using `start: "1h"` with no filters on busy hosts diff --git a/.claude/skills/observability/SKILL.md b/.claude/skills/observability/SKILL.md index c2b758c..f89ea93 100644 --- a/.claude/skills/observability/SKILL.md +++ b/.claude/skills/observability/SKILL.md @@ -30,11 +30,13 @@ Use the `lab-monitoring` MCP server tools: ### Label Reference Available labels for log queries: -- `host` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) +- `hostname` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) - matches the Prometheus `hostname` label - `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`) - `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs) - `filename` - For `varlog` job, the log file path -- `hostname` - Alternative to `host` for some streams +- `tier` - Deployment tier (`test` or `prod`) +- `role` - Host role (e.g., `dns`, `vault`, `monitoring`) - matches the Prometheus `role` label +- `level` - Log level mapped from journal PRIORITY (`critical`, `error`, `warning`, `notice`, `info`, `debug`) - journal scrape only ### 
Log Format @@ -47,12 +49,12 @@ Journal logs are JSON-formatted. Key fields: **Logs from a specific service on a host:** ```logql -{host="ns1", systemd_unit="nsd.service"} +{hostname="ns1", systemd_unit="nsd.service"} ``` **All logs from a host:** ```logql -{host="monitoring01"} +{hostname="monitoring01"} ``` **Logs from a service across all hosts:** @@ -62,12 +64,12 @@ Journal logs are JSON-formatted. Key fields: **Substring matching (case-sensitive):** ```logql -{host="ha1"} |= "error" +{hostname="ha1"} |= "error" ``` **Exclude pattern:** ```logql -{host="ns1"} != "routine" +{hostname="ns1"} != "routine" ``` **Regex matching:** @@ -75,6 +77,20 @@ Journal logs are JSON-formatted. Key fields: {systemd_unit="prometheus.service"} |~ "scrape.*failed" ``` +**Filter by level (journal scrape only):** +```logql +{level="error"} # All errors across the fleet +{level=~"critical|error", tier="prod"} # Prod errors and criticals +{hostname="ns1", level="warning"} # Warnings from a specific host +``` + +**Filter by tier/role:** +```logql +{tier="prod"} |= "error" # All errors on prod hosts +{role="dns"} # All DNS server logs +{tier="test", job="systemd-journal"} # Journal logs from test hosts +``` + **File-based logs (caddy access logs, etc):** ```logql {job="varlog", hostname="nix-cache01"} @@ -106,7 +122,7 @@ Useful systemd units for troubleshooting: VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). 
These logs use `job="bootstrap"` with additional labels: -- `host` - Target hostname +- `hostname` - Target hostname - `branch` - Git branch being deployed - `stage` - Bootstrap stage (see table below) @@ -127,7 +143,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl ```logql {job="bootstrap"} # All bootstrap logs -{job="bootstrap", host="myhost"} # Specific host +{job="bootstrap", hostname="myhost"} # Specific host {job="bootstrap", stage="failed"} # All failures {job="bootstrap", stage=~"building|success"} # Track build progress ``` @@ -308,8 +324,8 @@ Current host labels: 1. Check `up{job=""}` or `up{hostname=""}` for scrape failures 2. Use `list_targets` to see target health details -3. Query service logs: `{host="", systemd_unit=".service"}` -4. Search for errors: `{host=""} |= "error"` +3. Query service logs: `{hostname="", systemd_unit=".service"}` +4. Search for errors: `{hostname=""} |= "error"` 5. Check `list_alerts` for related alerts 6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers @@ -324,17 +340,17 @@ Current host labels: When provisioning new VMs, track bootstrap progress: -1. Watch bootstrap logs: `{job="bootstrap", host=""}` -2. Check for failures: `{job="bootstrap", host="", stage="failed"}` +1. Watch bootstrap logs: `{job="bootstrap", hostname=""}` +2. Check for failures: `{job="bootstrap", hostname="", stage="failed"}` 3. After success, verify host appears in metrics: `up{hostname=""}` -4. Check logs are flowing: `{host=""}` +4. Check logs are flowing: `{hostname=""}` See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline. 
### Debug SSH/Access Issues ```logql -{host="<hostname>", systemd_unit="sshd.service"} +{hostname="<hostname>", systemd_unit="sshd.service"} ``` ### Check Recent Upgrades diff --git a/CLAUDE.md b/CLAUDE.md index 2bbff94..452aea8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -59,7 +59,7 @@ The script prints the session ID which the user can share. Query results with: ```logql {job="pipe-to-loki"} # All entries {job="pipe-to-loki", id="my-test"} # Specific ID -{job="pipe-to-loki", host="testvm01"} # From specific host +{job="pipe-to-loki", hostname="testvm01"} # From specific host {job="pipe-to-loki", type="session"} # Only sessions ``` diff --git a/docs/host-creation.md b/docs/host-creation.md index af3bf44..64e71c6 100644 --- a/docs/host-creation.md +++ b/docs/host-creation.md @@ -50,7 +50,7 @@ homelab.host.tier = "test"; # or "prod" During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with: ``` -{job="bootstrap", host="<hostname>"} +{job="bootstrap", hostname="<hostname>"} ``` ### Bootstrap Stages @@ -72,7 +72,7 @@ The bootstrap process reports these stages via the `stage` label: ``` # All bootstrap activity for a host -{job="bootstrap", host="myhost"} +{job="bootstrap", hostname="myhost"} # Track all failures {job="bootstrap", stage="failed"} @@ -87,7 +87,7 @@ Once the VM reboots with its full configuration, it will start publishing metric 1. Check bootstrap completed successfully: ``` - {job="bootstrap", host="<hostname>", stage="success"} + {job="bootstrap", hostname="<hostname>", stage="success"} ``` 2. Verify the host is up and reporting metrics: @@ -102,7 +102,7 @@ Once the VM reboots with its full configuration, it will start publishing metric 4. Check logs are flowing: ``` - {host="<hostname>"} + {hostname="<hostname>"} ``` 5. Confirm expected services are running and producing logs @@ -119,7 +119,7 @@ Once the VM reboots with its full configuration, it will start publishing metric 1. 
Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources: ``` - {job="bootstrap", host="<hostname>"} + {job="bootstrap", hostname="<hostname>"} ``` 2. **USER**: SSH into the host and check the bootstrap service: @@ -149,7 +149,7 @@ Usually caused by running the `create-host` script without proper credentials, o 2. Check bootstrap logs for vault-related stages: ``` - {job="bootstrap", host="<hostname>", stage=~"vault.*"} + {job="bootstrap", hostname="<hostname>", stage=~"vault.*"} ``` 3. **USER**: Regenerate and provision credentials manually: diff --git a/docs/plans/loki-improvements.md b/docs/plans/loki-improvements.md index 68cbc2a..a00474e 100644 --- a/docs/plans/loki-improvements.md +++ b/docs/plans/loki-improvements.md @@ -86,13 +86,13 @@ These are generous limits that shouldn't affect normal operation but protect aga - The `varlog` scrape config uses `hostname` while journal uses `host` (different label name) - No `tier` or `role` labels, making it hard to filter logs by deployment tier or host function -**Recommendations:** +**Implemented:** Standardized on `hostname` to match Prometheus labels. The journal scrape previously used a relabel from `__journal__hostname` to `host`; now both scrape configs use a static `hostname` label from `config.networking.hostName`. Also updated `pipe-to-loki` and bootstrap scripts to use `hostname` instead of `host`. -1. **Fix varlog label:** Rename `hostname` to `host` for consistency with journal scrape config -2. **Add `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs -3. **Add `role` label:** Static label from `config.homelab.host.role` on both scrape configs, only when set (10 hosts have no role, so omit to keep labels clean) +1. **Standardized label:** Both scrape configs use `hostname` (matching Prometheus) via shared `hostLabels` +2. **Added `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs +3. 
**Added `role` label:** Static label from `config.homelab.host.role` on both scrape configs (conditionally, only when non-null) -No cardinality impact - `tier` and `role` are 1:1 with `host`, so they add metadata to existing streams without creating new ones. +No cardinality impact - `tier` and `role` are 1:1 with `hostname`, so they add metadata to existing streams without creating new ones. This enables queries like: - `{tier="prod"} |= "error"` - all errors on prod hosts @@ -167,10 +167,10 @@ For each service, check whether it supports a JSON log format option and whether 1. Add `compactor` section to `services/monitoring/loki.nix` 2. Add `limits_config` with 30-day retention and basic rate limits 3. Update `system/monitoring/logs.nix`: - - Fix `hostname` → `host` label in varlog scrape config - - Add `tier` static label from `config.homelab.host.tier` to both scrape configs - - Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs - - Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level` + - ~~Fix `hostname` → `host` label in varlog scrape config~~ Done: standardized on `hostname` (matching Prometheus) + - ~~Add `tier` static label from `config.homelab.host.tier` to both scrape configs~~ Done + - ~~Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs~~ Done + - ~~Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level`~~ Done 4. Deploy to monitoring01, verify compactor runs and old data gets cleaned 5. 
Verify `level` label works: `{level="error"}` should return results, and match cases where `detected_level="unknown"` diff --git a/hosts/template2/bootstrap.nix b/hosts/template2/bootstrap.nix index ea2d107..8accb5a 100644 --- a/hosts/template2/bootstrap.nix +++ b/hosts/template2/bootstrap.nix @@ -28,7 +28,7 @@ let streams: [{ stream: { job: "bootstrap", - host: $host, + hostname: $host, stage: $stage, branch: $branch }, diff --git a/system/pipe-to-loki.nix b/system/pipe-to-loki.nix index cb90e1c..7c4f3e4 100644 --- a/system/pipe-to-loki.nix +++ b/system/pipe-to-loki.nix @@ -61,7 +61,7 @@ let streams: [{ stream: { job: $job, - host: $host, + hostname: $host, type: $type, id: $id },