2 Commits

Author SHA1 Message Date
d485948df0 docs: update Loki queries from host to hostname label
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
Update all LogQL examples, agent instructions, and scripts to use
the hostname label instead of host, matching the Prometheus label
naming convention. Also update pipe-to-loki and bootstrap scripts
to push hostname instead of host.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 23:43:47 +01:00
7b804450a3 promtail: add hostname/tier/role labels and journal priority level mapping
Align Promtail labels with Prometheus by adding hostname, tier, and role
static labels to both journal and varlog scrape configs. Add pipeline
stages to map journal PRIORITY field to a level label for reliable
severity filtering across the fleet.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 23:40:14 +01:00
9 changed files with 85 additions and 53 deletions

View File

@@ -19,7 +19,7 @@ You may receive:
## Audit Log Structure
Logs are shipped to Loki via promtail. Audit events use these labels:
- `host` - hostname
- `hostname` - hostname
- `systemd_unit` - typically `auditd.service` for audit logs
- `job` - typically `systemd-journal`
@@ -36,7 +36,7 @@ Audit log entries contain structured data:
Find SSH logins and session activity:
```logql
{host="<hostname>", systemd_unit="sshd.service"}
{hostname="<hostname>", systemd_unit="sshd.service"}
```
Look for:
@@ -48,7 +48,7 @@ Look for:
Query executed commands (filter out noise):
```logql
{host="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
{hostname="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
```
Further filtering:
@@ -60,28 +60,28 @@ Further filtering:
Check for privilege escalation:
```logql
{host="<hostname>"} |= "sudo" |= "COMMAND"
{hostname="<hostname>"} |= "sudo" |= "COMMAND"
```
Or via audit:
```logql
{host="<hostname>"} |= "USER_CMD"
{hostname="<hostname>"} |= "USER_CMD"
```
### 4. Service Manipulation
Check if services were manually stopped/started:
```logql
{host="<hostname>"} |= "EXECVE" |= "systemctl"
{hostname="<hostname>"} |= "EXECVE" |= "systemctl"
```
### 5. File Operations
Look for file modifications (if auditd rules are configured):
```logql
{host="<hostname>"} |= "EXECVE" |= "vim"
{host="<hostname>"} |= "EXECVE" |= "nano"
{host="<hostname>"} |= "EXECVE" |= "rm"
{hostname="<hostname>"} |= "EXECVE" |= "vim"
{hostname="<hostname>"} |= "EXECVE" |= "nano"
{hostname="<hostname>"} |= "EXECVE" |= "rm"
```
## Query Guidelines
@@ -99,7 +99,7 @@ Look for file modifications (if auditd rules are configured):
**Time-bounded queries:**
When investigating around a specific event:
```logql
{host="<hostname>"} |= "EXECVE" != "systemd"
{hostname="<hostname>"} |= "EXECVE" != "systemd"
```
With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"`

View File

@@ -41,13 +41,13 @@ Search for relevant log entries using `query_logs`. Focus on service-specific lo
**Query strategies (start narrow, expand if needed):**
- Start with `limit: 20-30`, increase only if needed
- Use tight time windows: `start: "15m"` or `start: "30m"` initially
- Filter to specific services: `{host="<hostname>", systemd_unit="<service>.service"}`
- Search for errors: `{host="<hostname>"} |= "error"` or `|= "failed"`
- Filter to specific services: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- Search for errors: `{hostname="<hostname>"} |= "error"` or `|= "failed"`
**Common patterns:**
- Service logs: `{host="<hostname>", systemd_unit="<service>.service"}`
- All errors on host: `{host="<hostname>"} |= "error"`
- Journal for a unit: `{host="<hostname>", systemd_unit="nginx.service"} |= "failed"`
- Service logs: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- All errors on host: `{hostname="<hostname>"} |= "error"`
- Journal for a unit: `{hostname="<hostname>", systemd_unit="nginx.service"} |= "failed"`
**Avoid:**
- Using `start: "1h"` with no filters on busy hosts

View File

@@ -30,11 +30,13 @@ Use the `lab-monitoring` MCP server tools:
### Label Reference
Available labels for log queries:
- `host` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`)
- `hostname` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) - matches the Prometheus `hostname` label
- `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`)
- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
- `filename` - For `varlog` job, the log file path
- `hostname` - Alternative to `host` for some streams
- `tier` - Deployment tier (`test` or `prod`)
- `role` - Host role (e.g., `dns`, `vault`, `monitoring`) - matches the Prometheus `role` label
- `level` - Log level mapped from journal PRIORITY (`critical`, `error`, `warning`, `notice`, `info`, `debug`) - journal scrape only
### Log Format
@@ -47,12 +49,12 @@ Journal logs are JSON-formatted. Key fields:
**Logs from a specific service on a host:**
```logql
{host="ns1", systemd_unit="nsd.service"}
{hostname="ns1", systemd_unit="nsd.service"}
```
**All logs from a host:**
```logql
{host="monitoring01"}
{hostname="monitoring01"}
```
**Logs from a service across all hosts:**
@@ -62,12 +64,12 @@ Journal logs are JSON-formatted. Key fields:
**Substring matching (case-sensitive):**
```logql
{host="ha1"} |= "error"
{hostname="ha1"} |= "error"
```
**Exclude pattern:**
```logql
{host="ns1"} != "routine"
{hostname="ns1"} != "routine"
```
**Regex matching:**
@@ -75,6 +77,20 @@ Journal logs are JSON-formatted. Key fields:
{systemd_unit="prometheus.service"} |~ "scrape.*failed"
```
**Filter by level (journal scrape only):**
```logql
{level="error"} # All errors across the fleet
{level=~"critical|error", tier="prod"} # Prod errors and criticals
{hostname="ns1", level="warning"} # Warnings from a specific host
```
**Filter by tier/role:**
```logql
{tier="prod"} |= "error" # All errors on prod hosts
{role="dns"} # All DNS server logs
{tier="test", job="systemd-journal"} # Journal logs from test hosts
```
**File-based logs (caddy access logs, etc):**
```logql
{job="varlog", hostname="nix-cache01"}
@@ -106,7 +122,7 @@ Useful systemd units for troubleshooting:
VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:
- `host` - Target hostname
- `hostname` - Target hostname
- `branch` - Git branch being deployed
- `stage` - Bootstrap stage (see table below)
@@ -127,7 +143,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl
```logql
{job="bootstrap"} # All bootstrap logs
{job="bootstrap", host="myhost"} # Specific host
{job="bootstrap", hostname="myhost"} # Specific host
{job="bootstrap", stage="failed"} # All failures
{job="bootstrap", stage=~"building|success"} # Track build progress
```
@@ -308,8 +324,8 @@ Current host labels:
1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures
2. Use `list_targets` to see target health details
3. Query service logs: `{host="<host>", systemd_unit="<service>.service"}`
4. Search for errors: `{host="<host>"} |= "error"`
3. Query service logs: `{hostname="<host>", systemd_unit="<service>.service"}`
4. Search for errors: `{hostname="<host>"} |= "error"`
5. Check `list_alerts` for related alerts
6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers
@@ -324,17 +340,17 @@ Current host labels:
When provisioning new VMs, track bootstrap progress:
1. Watch bootstrap logs: `{job="bootstrap", host="<hostname>"}`
2. Check for failures: `{job="bootstrap", host="<hostname>", stage="failed"}`
1. Watch bootstrap logs: `{job="bootstrap", hostname="<hostname>"}`
2. Check for failures: `{job="bootstrap", hostname="<hostname>", stage="failed"}`
3. After success, verify host appears in metrics: `up{hostname="<hostname>"}`
4. Check logs are flowing: `{host="<hostname>"}`
4. Check logs are flowing: `{hostname="<hostname>"}`
See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline.
### Debug SSH/Access Issues
```logql
{host="<host>", systemd_unit="sshd.service"}
{hostname="<host>", systemd_unit="sshd.service"}
```
### Check Recent Upgrades

View File

@@ -59,7 +59,7 @@ The script prints the session ID which the user can share. Query results with:
```logql
{job="pipe-to-loki"} # All entries
{job="pipe-to-loki", id="my-test"} # Specific ID
{job="pipe-to-loki", host="testvm01"} # From specific host
{job="pipe-to-loki", hostname="testvm01"} # From specific host
{job="pipe-to-loki", type="session"} # Only sessions
```

View File

@@ -50,7 +50,7 @@ homelab.host.tier = "test"; # or "prod"
During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with:
```
{job="bootstrap", host="<hostname>"}
{job="bootstrap", hostname="<hostname>"}
```
### Bootstrap Stages
@@ -72,7 +72,7 @@ The bootstrap process reports these stages via the `stage` label:
```
# All bootstrap activity for a host
{job="bootstrap", host="myhost"}
{job="bootstrap", hostname="myhost"}
# Track all failures
{job="bootstrap", stage="failed"}
@@ -87,7 +87,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
1. Check bootstrap completed successfully:
```
{job="bootstrap", host="<hostname>", stage="success"}
{job="bootstrap", hostname="<hostname>", stage="success"}
```
2. Verify the host is up and reporting metrics:
@@ -102,7 +102,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
4. Check logs are flowing:
```
{host="<hostname>"}
{hostname="<hostname>"}
```
5. Confirm expected services are running and producing logs
@@ -119,7 +119,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources:
```
{job="bootstrap", host="<hostname>"}
{job="bootstrap", hostname="<hostname>"}
```
2. **USER**: SSH into the host and check the bootstrap service:
@@ -149,7 +149,7 @@ Usually caused by running the `create-host` script without proper credentials, o
2. Check bootstrap logs for vault-related stages:
```
{job="bootstrap", host="<hostname>", stage=~"vault.*"}
{job="bootstrap", hostname="<hostname>", stage=~"vault.*"}
```
3. **USER**: Regenerate and provision credentials manually:

View File

@@ -86,13 +86,13 @@ These are generous limits that shouldn't affect normal operation but protect aga
- The `varlog` scrape config uses `hostname` while journal uses `host` (different label name)
- No `tier` or `role` labels, making it hard to filter logs by deployment tier or host function
**Recommendations:**
**Implemented:** Standardized on `hostname` to match Prometheus labels. The journal scrape previously used a relabel from `__journal__hostname` to `host`; now both scrape configs use a static `hostname` label from `config.networking.hostName`. Also updated `pipe-to-loki` and bootstrap scripts to use `hostname` instead of `host`.
1. **Fix varlog label:** Rename `hostname` to `host` for consistency with journal scrape config
2. **Add `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs
3. **Add `role` label:** Static label from `config.homelab.host.role` on both scrape configs, only when set (10 hosts have no role, so omit to keep labels clean)
1. **Standardized label:** Both scrape configs use `hostname` (matching Prometheus) via shared `hostLabels`
2. **Added `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs
3. **Added `role` label:** Static label from `config.homelab.host.role` on both scrape configs (conditionally, only when non-null)
No cardinality impact - `tier` and `role` are 1:1 with `host`, so they add metadata to existing streams without creating new ones.
No cardinality impact - `tier` and `role` are 1:1 with `hostname`, so they add metadata to existing streams without creating new ones.
This enables queries like:
- `{tier="prod"} |= "error"` - all errors on prod hosts
@@ -167,10 +167,10 @@ For each service, check whether it supports a JSON log format option and whether
1. Add `compactor` section to `services/monitoring/loki.nix`
2. Add `limits_config` with 30-day retention and basic rate limits
3. Update `system/monitoring/logs.nix`:
- Fix `hostname` → `host` label in varlog scrape config
- Add `tier` static label from `config.homelab.host.tier` to both scrape configs
- Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs
- Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level`
- ~~Fix `hostname` → `host` label in varlog scrape config~~ Done: standardized on `hostname` (matching Prometheus)
- ~~Add `tier` static label from `config.homelab.host.tier` to both scrape configs~~ Done
- ~~Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs~~ Done
- ~~Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level`~~ Done
4. Deploy to monitoring01, verify compactor runs and old data gets cleaned
5. Verify `level` label works: `{level="error"}` should return results, and match cases where `detected_level="unknown"`

View File

@@ -28,7 +28,7 @@ let
streams: [{
stream: {
job: "bootstrap",
host: $host,
hostname: $host,
stage: $stage,
branch: $branch
},

View File

@@ -1,4 +1,12 @@
{ config, ... }:
{ config, lib, ... }:
let
hostLabels = {
hostname = config.networking.hostName;
tier = config.homelab.host.tier;
} // lib.optionalAttrs (config.homelab.host.role != null) {
role = config.homelab.host.role;
};
in
{
# Configure journald
services.journald = {
@@ -32,17 +40,26 @@
json = true;
labels = {
job = "systemd-journal";
};
} // hostLabels;
};
relabel_configs = [
{
source_labels = [ "__journal__systemd_unit" ];
target_label = "systemd_unit";
}
];
pipeline_stages = [
# Extract PRIORITY from journal JSON
{ json.expressions.priority = "PRIORITY"; }
# Map numeric PRIORITY to level name
{
source_labels = [ "__journal__hostname" ];
target_label = "host";
template = {
source = "priority";
template = ''{{ if or (eq .Value "0") (eq .Value "1") (eq .Value "2") }}critical{{ else if eq .Value "3" }}error{{ else if eq .Value "4" }}warning{{ else if eq .Value "5" }}notice{{ else if eq .Value "6" }}info{{ else if eq .Value "7" }}debug{{ end }}'';
};
}
# Attach as level label
{ labels.level = "priority"; }
];
}
{
@@ -53,8 +70,7 @@
labels = {
job = "varlog";
__path__ = "/var/log/**/*.log";
hostname = "${config.networking.hostName}";
};
} // hostLabels;
}
];
}

View File

@@ -61,7 +61,7 @@ let
streams: [{
stream: {
job: $job,
host: $host,
hostname: $host,
type: $type,
id: $id
},