docs: update Loki queries from host to hostname label

Update all LogQL examples, agent instructions, and scripts to use the hostname label instead of host, matching the Prometheus label naming convention. Also update pipe-to-loki and bootstrap scripts to push hostname instead of host.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 23:43:47 +01:00
parent 7b804450a3
commit d485948df0
8 changed files with 63 additions and 47 deletions
--- a/.claude/agents/auditor.md
+++ b/.claude/agents/auditor.md
@@ -19,7 +19,7 @@ You may receive:
 ## Audit Log Structure
 Logs are shipped to Loki via promtail. Audit events use these labels:
- `host` - hostname
+- `hostname` - name of the host that emitted the log (matches the Prometheus `hostname` label)
 - `systemd_unit` - typically `auditd.service` for audit logs
 - `job` - typically `systemd-journal`
@@ -36,7 +36,7 @@ Audit log entries contain structured data:
 Find SSH logins and session activity:
 ```logql
-{host="<hostname>", systemd_unit="sshd.service"}
+{hostname="<hostname>", systemd_unit="sshd.service"}
 ```
 Look for:
@@ -48,7 +48,7 @@ Look for:
 Query executed commands (filter out noise):
 ```logql
-{host="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
+{hostname="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
 ```
 Further filtering:
@@ -60,28 +60,28 @@ Further filtering:
 Check for privilege escalation:
 ```logql
-{host="<hostname>"} |= "sudo" |= "COMMAND"
+{hostname="<hostname>"} |= "sudo" |= "COMMAND"
 ```
 Or via audit:
 ```logql
-{host="<hostname>"} |= "USER_CMD"
+{hostname="<hostname>"} |= "USER_CMD"
 ```
 ### 4. Service Manipulation
 Check if services were manually stopped/started:
 ```logql
-{host="<hostname>"} |= "EXECVE" |= "systemctl"
+{hostname="<hostname>"} |= "EXECVE" |= "systemctl"
 ```
 ### 5. File Operations
 Look for file modifications (if auditd rules are configured):
 ```logql
-{host="<hostname>"} |= "EXECVE" |= "vim"
+{hostname="<hostname>"} |= "EXECVE" |= "vim"
-{host="<hostname>"} |= "EXECVE" |= "nano"
+{hostname="<hostname>"} |= "EXECVE" |= "nano"
-{host="<hostname>"} |= "EXECVE" |= "rm"
+{hostname="<hostname>"} |= "EXECVE" |= "rm"
 ```
 ## Query Guidelines
@@ -99,7 +99,7 @@ Look for file modifications (if auditd rules are configured):
 **Time-bounded queries:**
 When investigating around a specific event:
 ```logql
-{host="<hostname>"} |= "EXECVE" != "systemd"
+{hostname="<hostname>"} |= "EXECVE" != "systemd"
 ```
 With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"`
--- a/.claude/agents/investigate-alarm.md
+++ b/.claude/agents/investigate-alarm.md
@@ -41,13 +41,13 @@ Search for relevant log entries using `query_logs`. Focus on service-specific lo
 **Query strategies (start narrow, expand if needed):**
 - Start with `limit: 20-30`, increase only if needed
 - Use tight time windows: `start: "15m"` or `start: "30m"` initially
- Filter to specific services: `{host="<hostname>", systemd_unit="<service>.service"}`
+- Filter to specific services: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- Search for errors: `{host="<hostname>"} |= "error"` or `|= "failed"`
+- Search for errors: `{hostname="<hostname>"} |= "error"` or `|= "failed"`
 **Common patterns:**
- Service logs: `{host="<hostname>", systemd_unit="<service>.service"}`
+- Service logs: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- All errors on host: `{host="<hostname>"} |= "error"`
+- All errors on host: `{hostname="<hostname>"} |= "error"`
- Journal for a unit: `{host="<hostname>", systemd_unit="nginx.service"} |= "failed"`
+- Journal for a unit: `{hostname="<hostname>", systemd_unit="nginx.service"} |= "failed"`
 **Avoid:**
 - Using `start: "1h"` with no filters on busy hosts
--- a/.claude/skills/observability/SKILL.md
+++ b/.claude/skills/observability/SKILL.md
@@ -30,11 +30,13 @@ Use the `lab-monitoring` MCP server tools:
 ### Label Reference
 Available labels for log queries:
- `host` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`)
+- `hostname` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) - matches the Prometheus `hostname` label
 - `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`)
 - `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
 - `filename` - For `varlog` job, the log file path
- `hostname` - Alternative to `host` for some streams
+- `tier` - Deployment tier (`test` or `prod`)
+- `role` - Host role (e.g., `dns`, `vault`, `monitoring`) - matches the Prometheus `role` label
+- `level` - Log level mapped from journal PRIORITY (`critical`, `error`, `warning`, `notice`, `info`, `debug`) - journal scrape only
 ### Log Format
@@ -47,12 +49,12 @@ Journal logs are JSON-formatted. Key fields:
 **Logs from a specific service on a host:**
 ```logql
-{host="ns1", systemd_unit="nsd.service"}
+{hostname="ns1", systemd_unit="nsd.service"}
 ```
 **All logs from a host:**
 ```logql
-{host="monitoring01"}
+{hostname="monitoring01"}
 ```
 **Logs from a service across all hosts:**
@@ -62,12 +64,12 @@ Journal logs are JSON-formatted. Key fields:
 **Substring matching (case-sensitive):**
 ```logql
-{host="ha1"} |= "error"
+{hostname="ha1"} |= "error"
 ```
 **Exclude pattern:**
 ```logql
-{host="ns1"} != "routine"
+{hostname="ns1"} != "routine"
 ```
 **Regex matching:**
@@ -75,6 +77,20 @@ Journal logs are JSON-formatted. Key fields:
 {systemd_unit="prometheus.service"} |~ "scrape.*failed"
 ```
+**Filter by level (journal scrape only):**
+```logql
+{level="error"}                                  # All errors across the fleet
+{level=~"critical|error", tier="prod"}           # Prod errors and criticals
+{hostname="ns1", level="warning"}                # Warnings from a specific host
+```
+**Filter by tier/role:**
+```logql
+{tier="prod"} |= "error"                        # All errors on prod hosts
+{role="dns"}                                     # All DNS server logs
+{tier="test", job="systemd-journal"}             # Journal logs from test hosts
+```
 **File-based logs (caddy access logs, etc):**
 ```logql
 {job="varlog", hostname="nix-cache01"}
@@ -106,7 +122,7 @@ Useful systemd units for troubleshooting:
 VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:
- `host` - Target hostname
+- `hostname` - Target hostname
 - `branch` - Git branch being deployed
 - `stage` - Bootstrap stage (see table below)
@@ -127,7 +143,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl
 ```logql
 {job="bootstrap"}                              # All bootstrap logs
-{job="bootstrap", host="myhost"}               # Specific host
+{job="bootstrap", hostname="myhost"}           # Specific host
 {job="bootstrap", stage="failed"}              # All failures
 {job="bootstrap", stage=~"building|success"}   # Track build progress
 ```
@@ -308,8 +324,8 @@ Current host labels:
 1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures
 2. Use `list_targets` to see target health details
-3. Query service logs: `{host="<host>", systemd_unit="<service>.service"}`
+3. Query service logs: `{hostname="<host>", systemd_unit="<service>.service"}`
-4. Search for errors: `{host="<host>"} |= "error"`
+4. Search for errors: `{hostname="<host>"} |= "error"`
 5. Check `list_alerts` for related alerts
 6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers
@@ -324,17 +340,17 @@ Current host labels:
 When provisioning new VMs, track bootstrap progress:
-1. Watch bootstrap logs: `{job="bootstrap", host="<hostname>"}`
+1. Watch bootstrap logs: `{job="bootstrap", hostname="<hostname>"}`
-2. Check for failures: `{job="bootstrap", host="<hostname>", stage="failed"}`
+2. Check for failures: `{job="bootstrap", hostname="<hostname>", stage="failed"}`
 3. After success, verify host appears in metrics: `up{hostname="<hostname>"}`
-4. Check logs are flowing: `{host="<hostname>"}`
+4. Check logs are flowing: `{hostname="<hostname>"}`
 See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline.
 ### Debug SSH/Access Issues
 ```logql
-{host="<host>", systemd_unit="sshd.service"}
+{hostname="<host>", systemd_unit="sshd.service"}
 ```
 ### Check Recent Upgrades
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -59,7 +59,7 @@ The script prints the session ID which the user can share. Query results with:
 ```logql
 {job="pipe-to-loki"}                           # All entries
 {job="pipe-to-loki", id="my-test"}             # Specific ID
-{job="pipe-to-loki", host="testvm01"}          # From specific host
+{job="pipe-to-loki", hostname="testvm01"}      # From specific host
 {job="pipe-to-loki", type="session"}           # Only sessions
 ```
--- a/docs/host-creation.md
+++ b/docs/host-creation.md
@@ -50,7 +50,7 @@ homelab.host.tier = "test";  # or "prod"
 During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with:
 ```
-{job="bootstrap", host="<hostname>"}
+{job="bootstrap", hostname="<hostname>"}
 ```
 ### Bootstrap Stages
@@ -72,7 +72,7 @@ The bootstrap process reports these stages via the `stage` label:
 ```
 # All bootstrap activity for a host
-{job="bootstrap", host="myhost"}
+{job="bootstrap", hostname="myhost"}
 # Track all failures
 {job="bootstrap", stage="failed"}
@@ -87,7 +87,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
 1. Check bootstrap completed successfully:
   ```
-   {job="bootstrap", host="<hostname>", stage="success"}
+   {job="bootstrap", hostname="<hostname>", stage="success"}
   ```
 2. Verify the host is up and reporting metrics:
@@ -102,7 +102,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
 4. Check logs are flowing:
   ```
-   {host="<hostname>"}
+   {hostname="<hostname>"}
   ```
 5. Confirm expected services are running and producing logs
@@ -119,7 +119,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
 1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources:
   ```
-   {job="bootstrap", host="<hostname>"}
+   {job="bootstrap", hostname="<hostname>"}
   ```
 2. **USER**: SSH into the host and check the bootstrap service:
@@ -149,7 +149,7 @@ Usually caused by running the `create-host` script without proper credentials, o
 2. Check bootstrap logs for vault-related stages:
   ```
-   {job="bootstrap", host="<hostname>", stage=~"vault.*"}
+   {job="bootstrap", hostname="<hostname>", stage=~"vault.*"}
   ```
 3. **USER**: Regenerate and provision credentials manually:
--- a/docs/plans/loki-improvements.md
+++ b/docs/plans/loki-improvements.md
@@ -86,13 +86,13 @@ These are generous limits that shouldn't affect normal operation but protect aga
 - The `varlog` scrape config uses `hostname` while journal uses `host` (different label name)
 - No `tier` or `role` labels, making it hard to filter logs by deployment tier or host function
-**Recommendations:**
+**Implemented:** Standardized on `hostname` to match Prometheus labels. The journal scrape previously used a relabel from `__journal__hostname` to `host`; now both scrape configs use a static `hostname` label from `config.networking.hostName`. Also updated `pipe-to-loki` and bootstrap scripts to use `hostname` instead of `host`.
-1. **Fix varlog label:** Rename `hostname` to `host` for consistency with journal scrape config
+1. **Standardized label:** Both scrape configs use `hostname` (matching Prometheus) via shared `hostLabels`
-2. **Add `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs
+2. **Added `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs
-3. **Add `role` label:** Static label from `config.homelab.host.role` on both scrape configs, only when set (10 hosts have no role, so omit to keep labels clean)
+3. **Added `role` label:** Static label from `config.homelab.host.role` on both scrape configs (conditionally, only when non-null)
-No cardinality impact - `tier` and `role` are 1:1 with `host`, so they add metadata to existing streams without creating new ones.
+No cardinality impact - `tier` and `role` are 1:1 with `hostname`, so they add metadata to existing streams without creating new ones.
 This enables queries like:
 - `{tier="prod"} |= "error"` - all errors on prod hosts
@@ -167,10 +167,10 @@ For each service, check whether it supports a JSON log format option and whether
 1. Add `compactor` section to `services/monitoring/loki.nix`
 2. Add `limits_config` with 30-day retention and basic rate limits
 3. Update `system/monitoring/logs.nix`:
-   - Fix `hostname` → `host` label in varlog scrape config
+   - ~~Fix `hostname` → `host` label in varlog scrape config~~ Done: standardized on `hostname` (matching Prometheus)
-   - Add `tier` static label from `config.homelab.host.tier` to both scrape configs
+   - ~~Add `tier` static label from `config.homelab.host.tier` to both scrape configs~~ Done
-   - Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs
+   - ~~Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs~~ Done
-   - Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level`
+   - ~~Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level`~~ Done
 4. Deploy to monitoring01, verify compactor runs and old data gets cleaned
 5. Verify `level` label works: `{level="error"}` should return results, and match cases where `detected_level="unknown"`
--- a/hosts/template2/bootstrap.nix
+++ b/hosts/template2/bootstrap.nix
@@ -28,7 +28,7 @@ let
            streams: [{
              stream: {
                job: "bootstrap",
-                host: $host,
+                hostname: $host,
                stage: $stage,
                branch: $branch
              },
--- a/system/pipe-to-loki.nix
+++ b/system/pipe-to-loki.nix
@@ -61,7 +61,7 @@ let
            streams: [{
              stream: {
                job: $job,
-                host: $host,
+                hostname: $host,
                type: $type,
                id: $id
              },