2 Commits

Author SHA1 Message Date
d485948df0 docs: update Loki queries from host to hostname label
Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
Update all LogQL examples, agent instructions, and scripts to use
the hostname label instead of host, matching the Prometheus label
naming convention. Also update pipe-to-loki and bootstrap scripts
to push hostname instead of host.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 23:43:47 +01:00
7b804450a3 promtail: add hostname/tier/role labels and journal priority level mapping
Align Promtail labels with Prometheus by adding hostname, tier, and role
static labels to both journal and varlog scrape configs. Add pipeline
stages to map journal PRIORITY field to a level label for reliable
severity filtering across the fleet.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 23:40:14 +01:00
9 changed files with 85 additions and 53 deletions

View File

@@ -19,7 +19,7 @@ You may receive:
## Audit Log Structure ## Audit Log Structure
Logs are shipped to Loki via promtail. Audit events use these labels: Logs are shipped to Loki via promtail. Audit events use these labels:
- `host` - hostname - `hostname` - hostname
- `systemd_unit` - typically `auditd.service` for audit logs - `systemd_unit` - typically `auditd.service` for audit logs
- `job` - typically `systemd-journal` - `job` - typically `systemd-journal`
@@ -36,7 +36,7 @@ Audit log entries contain structured data:
Find SSH logins and session activity: Find SSH logins and session activity:
```logql ```logql
{host="<hostname>", systemd_unit="sshd.service"} {hostname="<hostname>", systemd_unit="sshd.service"}
``` ```
Look for: Look for:
@@ -48,7 +48,7 @@ Look for:
Query executed commands (filter out noise): Query executed commands (filter out noise):
```logql ```logql
{host="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF" {hostname="<hostname>"} |= "EXECVE" != "PATH item" != "PROCTITLE" != "SYSCALL" != "BPF"
``` ```
Further filtering: Further filtering:
@@ -60,28 +60,28 @@ Further filtering:
Check for privilege escalation: Check for privilege escalation:
```logql ```logql
{host="<hostname>"} |= "sudo" |= "COMMAND" {hostname="<hostname>"} |= "sudo" |= "COMMAND"
``` ```
Or via audit: Or via audit:
```logql ```logql
{host="<hostname>"} |= "USER_CMD" {hostname="<hostname>"} |= "USER_CMD"
``` ```
### 4. Service Manipulation ### 4. Service Manipulation
Check if services were manually stopped/started: Check if services were manually stopped/started:
```logql ```logql
{host="<hostname>"} |= "EXECVE" |= "systemctl" {hostname="<hostname>"} |= "EXECVE" |= "systemctl"
``` ```
### 5. File Operations ### 5. File Operations
Look for file modifications (if auditd rules are configured): Look for file modifications (if auditd rules are configured):
```logql ```logql
{host="<hostname>"} |= "EXECVE" |= "vim" {hostname="<hostname>"} |= "EXECVE" |= "vim"
{host="<hostname>"} |= "EXECVE" |= "nano" {hostname="<hostname>"} |= "EXECVE" |= "nano"
{host="<hostname>"} |= "EXECVE" |= "rm" {hostname="<hostname>"} |= "EXECVE" |= "rm"
``` ```
## Query Guidelines ## Query Guidelines
@@ -99,7 +99,7 @@ Look for file modifications (if auditd rules are configured):
**Time-bounded queries:** **Time-bounded queries:**
When investigating around a specific event: When investigating around a specific event:
```logql ```logql
{host="<hostname>"} |= "EXECVE" != "systemd" {hostname="<hostname>"} |= "EXECVE" != "systemd"
``` ```
With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"` With `start: "2026-02-08T14:30:00Z"` and `end: "2026-02-08T14:35:00Z"`

View File

@@ -41,13 +41,13 @@ Search for relevant log entries using `query_logs`. Focus on service-specific lo
**Query strategies (start narrow, expand if needed):** **Query strategies (start narrow, expand if needed):**
- Start with `limit: 20-30`, increase only if needed - Start with `limit: 20-30`, increase only if needed
- Use tight time windows: `start: "15m"` or `start: "30m"` initially - Use tight time windows: `start: "15m"` or `start: "30m"` initially
- Filter to specific services: `{host="<hostname>", systemd_unit="<service>.service"}` - Filter to specific services: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- Search for errors: `{host="<hostname>"} |= "error"` or `|= "failed"` - Search for errors: `{hostname="<hostname>"} |= "error"` or `|= "failed"`
**Common patterns:** **Common patterns:**
- Service logs: `{host="<hostname>", systemd_unit="<service>.service"}` - Service logs: `{hostname="<hostname>", systemd_unit="<service>.service"}`
- All errors on host: `{host="<hostname>"} |= "error"` - All errors on host: `{hostname="<hostname>"} |= "error"`
- Journal for a unit: `{host="<hostname>", systemd_unit="nginx.service"} |= "failed"` - Journal for a unit: `{hostname="<hostname>", systemd_unit="nginx.service"} |= "failed"`
**Avoid:** **Avoid:**
- Using `start: "1h"` with no filters on busy hosts - Using `start: "1h"` with no filters on busy hosts

View File

@@ -30,11 +30,13 @@ Use the `lab-monitoring` MCP server tools:
### Label Reference ### Label Reference
Available labels for log queries: Available labels for log queries:
- `host` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) - `hostname` - Hostname (e.g., `ns1`, `monitoring01`, `ha1`) - matches the Prometheus `hostname` label
- `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`) - `systemd_unit` - Systemd unit name (e.g., `nsd.service`, `nixos-upgrade.service`)
- `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs) - `job` - Either `systemd-journal` (most logs), `varlog` (file-based logs), or `bootstrap` (VM bootstrap logs)
- `filename` - For `varlog` job, the log file path - `filename` - For `varlog` job, the log file path
- `hostname` - Alternative to `host` for some streams - `tier` - Deployment tier (`test` or `prod`)
- `role` - Host role (e.g., `dns`, `vault`, `monitoring`) - matches the Prometheus `role` label
- `level` - Log level mapped from journal PRIORITY (`critical`, `error`, `warning`, `notice`, `info`, `debug`) - journal scrape only
### Log Format ### Log Format
@@ -47,12 +49,12 @@ Journal logs are JSON-formatted. Key fields:
**Logs from a specific service on a host:** **Logs from a specific service on a host:**
```logql ```logql
{host="ns1", systemd_unit="nsd.service"} {hostname="ns1", systemd_unit="nsd.service"}
``` ```
**All logs from a host:** **All logs from a host:**
```logql ```logql
{host="monitoring01"} {hostname="monitoring01"}
``` ```
**Logs from a service across all hosts:** **Logs from a service across all hosts:**
@@ -62,12 +64,12 @@ Journal logs are JSON-formatted. Key fields:
**Substring matching (case-sensitive):** **Substring matching (case-sensitive):**
```logql ```logql
{host="ha1"} |= "error" {hostname="ha1"} |= "error"
``` ```
**Exclude pattern:** **Exclude pattern:**
```logql ```logql
{host="ns1"} != "routine" {hostname="ns1"} != "routine"
``` ```
**Regex matching:** **Regex matching:**
@@ -75,6 +77,20 @@ Journal logs are JSON-formatted. Key fields:
{systemd_unit="prometheus.service"} |~ "scrape.*failed" {systemd_unit="prometheus.service"} |~ "scrape.*failed"
``` ```
**Filter by level (journal scrape only):**
```logql
{level="error"} # All errors across the fleet
{level=~"critical|error", tier="prod"} # Prod errors and criticals
{hostname="ns1", level="warning"} # Warnings from a specific host
```
**Filter by tier/role:**
```logql
{tier="prod"} |= "error" # All errors on prod hosts
{role="dns"} # All DNS server logs
{tier="test", job="systemd-journal"} # Journal logs from test hosts
```
**File-based logs (caddy access logs, etc):** **File-based logs (caddy access logs, etc):**
```logql ```logql
{job="varlog", hostname="nix-cache01"} {job="varlog", hostname="nix-cache01"}
@@ -106,7 +122,7 @@ Useful systemd units for troubleshooting:
VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels: VMs provisioned from template2 send bootstrap progress directly to Loki via curl (before promtail is available). These logs use `job="bootstrap"` with additional labels:
- `host` - Target hostname - `hostname` - Target hostname
- `branch` - Git branch being deployed - `branch` - Git branch being deployed
- `stage` - Bootstrap stage (see table below) - `stage` - Bootstrap stage (see table below)
@@ -127,7 +143,7 @@ VMs provisioned from template2 send bootstrap progress directly to Loki via curl
```logql ```logql
{job="bootstrap"} # All bootstrap logs {job="bootstrap"} # All bootstrap logs
{job="bootstrap", host="myhost"} # Specific host {job="bootstrap", hostname="myhost"} # Specific host
{job="bootstrap", stage="failed"} # All failures {job="bootstrap", stage="failed"} # All failures
{job="bootstrap", stage=~"building|success"} # Track build progress {job="bootstrap", stage=~"building|success"} # Track build progress
``` ```
@@ -308,8 +324,8 @@ Current host labels:
1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures 1. Check `up{job="<service>"}` or `up{hostname="<host>"}` for scrape failures
2. Use `list_targets` to see target health details 2. Use `list_targets` to see target health details
3. Query service logs: `{host="<host>", systemd_unit="<service>.service"}` 3. Query service logs: `{hostname="<host>", systemd_unit="<service>.service"}`
4. Search for errors: `{host="<host>"} |= "error"` 4. Search for errors: `{hostname="<host>"} |= "error"`
5. Check `list_alerts` for related alerts 5. Check `list_alerts` for related alerts
6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers 6. Use role filters for group issues: `up{role="dns"}` to check all DNS servers
@@ -324,17 +340,17 @@ Current host labels:
When provisioning new VMs, track bootstrap progress: When provisioning new VMs, track bootstrap progress:
1. Watch bootstrap logs: `{job="bootstrap", host="<hostname>"}` 1. Watch bootstrap logs: `{job="bootstrap", hostname="<hostname>"}`
2. Check for failures: `{job="bootstrap", host="<hostname>", stage="failed"}` 2. Check for failures: `{job="bootstrap", hostname="<hostname>", stage="failed"}`
3. After success, verify host appears in metrics: `up{hostname="<hostname>"}` 3. After success, verify host appears in metrics: `up{hostname="<hostname>"}`
4. Check logs are flowing: `{host="<hostname>"}` 4. Check logs are flowing: `{hostname="<hostname>"}`
See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline. See [docs/host-creation.md](../../../docs/host-creation.md) for the full host creation pipeline.
### Debug SSH/Access Issues ### Debug SSH/Access Issues
```logql ```logql
{host="<host>", systemd_unit="sshd.service"} {hostname="<host>", systemd_unit="sshd.service"}
``` ```
### Check Recent Upgrades ### Check Recent Upgrades

View File

@@ -59,7 +59,7 @@ The script prints the session ID which the user can share. Query results with:
```logql ```logql
{job="pipe-to-loki"} # All entries {job="pipe-to-loki"} # All entries
{job="pipe-to-loki", id="my-test"} # Specific ID {job="pipe-to-loki", id="my-test"} # Specific ID
{job="pipe-to-loki", host="testvm01"} # From specific host {job="pipe-to-loki", hostname="testvm01"} # From specific host
{job="pipe-to-loki", type="session"} # Only sessions {job="pipe-to-loki", type="session"} # Only sessions
``` ```

View File

@@ -50,7 +50,7 @@ homelab.host.tier = "test"; # or "prod"
During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with: During the bootstrap process, status updates are sent to Loki. Query bootstrap logs with:
``` ```
{job="bootstrap", host="<hostname>"} {job="bootstrap", hostname="<hostname>"}
``` ```
### Bootstrap Stages ### Bootstrap Stages
@@ -72,7 +72,7 @@ The bootstrap process reports these stages via the `stage` label:
``` ```
# All bootstrap activity for a host # All bootstrap activity for a host
{job="bootstrap", host="myhost"} {job="bootstrap", hostname="myhost"}
# Track all failures # Track all failures
{job="bootstrap", stage="failed"} {job="bootstrap", stage="failed"}
@@ -87,7 +87,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
1. Check bootstrap completed successfully: 1. Check bootstrap completed successfully:
``` ```
{job="bootstrap", host="<hostname>", stage="success"} {job="bootstrap", hostname="<hostname>", stage="success"}
``` ```
2. Verify the host is up and reporting metrics: 2. Verify the host is up and reporting metrics:
@@ -102,7 +102,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
4. Check logs are flowing: 4. Check logs are flowing:
``` ```
{host="<hostname>"} {hostname="<hostname>"}
``` ```
5. Confirm expected services are running and producing logs 5. Confirm expected services are running and producing logs
@@ -119,7 +119,7 @@ Once the VM reboots with its full configuration, it will start publishing metric
1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources: 1. Check bootstrap logs in Loki - if they never progress past `building`, the rebuild likely consumed all resources:
``` ```
{job="bootstrap", host="<hostname>"} {job="bootstrap", hostname="<hostname>"}
``` ```
2. **USER**: SSH into the host and check the bootstrap service: 2. **USER**: SSH into the host and check the bootstrap service:
@@ -149,7 +149,7 @@ Usually caused by running the `create-host` script without proper credentials, o
2. Check bootstrap logs for vault-related stages: 2. Check bootstrap logs for vault-related stages:
``` ```
{job="bootstrap", host="<hostname>", stage=~"vault.*"} {job="bootstrap", hostname="<hostname>", stage=~"vault.*"}
``` ```
3. **USER**: Regenerate and provision credentials manually: 3. **USER**: Regenerate and provision credentials manually:

View File

@@ -86,13 +86,13 @@ These are generous limits that shouldn't affect normal operation but protect aga
- The `varlog` scrape config uses `hostname` while journal uses `host` (different label name) - The `varlog` scrape config uses `hostname` while journal uses `host` (different label name)
- No `tier` or `role` labels, making it hard to filter logs by deployment tier or host function - No `tier` or `role` labels, making it hard to filter logs by deployment tier or host function
**Recommendations:** **Implemented:** Standardized on `hostname` to match Prometheus labels. The journal scrape previously used a relabel from `__journal__hostname` to `host`; now both scrape configs use a static `hostname` label from `config.networking.hostName`. Also updated `pipe-to-loki` and bootstrap scripts to use `hostname` instead of `host`.
1. **Fix varlog label:** Rename `hostname` to `host` for consistency with journal scrape config 1. **Standardized label:** Both scrape configs use `hostname` (matching Prometheus) via shared `hostLabels`
2. **Add `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs 2. **Added `tier` label:** Static label from `config.homelab.host.tier` (`test`/`prod`) on both scrape configs
3. **Add `role` label:** Static label from `config.homelab.host.role` on both scrape configs, only when set (10 hosts have no role, so omit to keep labels clean) 3. **Added `role` label:** Static label from `config.homelab.host.role` on both scrape configs (conditionally, only when non-null)
No cardinality impact - `tier` and `role` are 1:1 with `host`, so they add metadata to existing streams without creating new ones. No cardinality impact - `tier` and `role` are 1:1 with `hostname`, so they add metadata to existing streams without creating new ones.
This enables queries like: This enables queries like:
- `{tier="prod"} |= "error"` - all errors on prod hosts - `{tier="prod"} |= "error"` - all errors on prod hosts
@@ -167,10 +167,10 @@ For each service, check whether it supports a JSON log format option and whether
1. Add `compactor` section to `services/monitoring/loki.nix` 1. Add `compactor` section to `services/monitoring/loki.nix`
2. Add `limits_config` with 30-day retention and basic rate limits 2. Add `limits_config` with 30-day retention and basic rate limits
3. Update `system/monitoring/logs.nix`: 3. Update `system/monitoring/logs.nix`:
- Fix `hostname` → `host` label in varlog scrape config - ~~Fix `hostname` → `host` label in varlog scrape config~~ Done: standardized on `hostname` (matching Prometheus)
- Add `tier` static label from `config.homelab.host.tier` to both scrape configs - ~~Add `tier` static label from `config.homelab.host.tier` to both scrape configs~~ Done
- Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs - ~~Add `role` static label from `config.homelab.host.role` (conditionally, only when set) to both scrape configs~~ Done
- Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level` - ~~Add pipeline stages to journal scrape config: `json` to extract PRIORITY, `template` to map to level name, `labels` to attach as `level`~~ Done
4. Deploy to monitoring01, verify compactor runs and old data gets cleaned 4. Deploy to monitoring01, verify compactor runs and old data gets cleaned
5. Verify `level` label works: `{level="error"}` should return results, and match cases where `detected_level="unknown"` 5. Verify `level` label works: `{level="error"}` should return results, and match cases where `detected_level="unknown"`

View File

@@ -28,7 +28,7 @@ let
streams: [{ streams: [{
stream: { stream: {
job: "bootstrap", job: "bootstrap",
host: $host, hostname: $host,
stage: $stage, stage: $stage,
branch: $branch branch: $branch
}, },

View File

@@ -1,4 +1,12 @@
{ config, ... }: { config, lib, ... }:
let
hostLabels = {
hostname = config.networking.hostName;
tier = config.homelab.host.tier;
} // lib.optionalAttrs (config.homelab.host.role != null) {
role = config.homelab.host.role;
};
in
{ {
# Configure journald # Configure journald
services.journald = { services.journald = {
@@ -32,17 +40,26 @@
json = true; json = true;
labels = { labels = {
job = "systemd-journal"; job = "systemd-journal";
}; } // hostLabels;
}; };
relabel_configs = [ relabel_configs = [
{ {
source_labels = [ "__journal__systemd_unit" ]; source_labels = [ "__journal__systemd_unit" ];
target_label = "systemd_unit"; target_label = "systemd_unit";
} }
];
pipeline_stages = [
# Extract PRIORITY from journal JSON
{ json.expressions.priority = "PRIORITY"; }
# Map numeric PRIORITY to level name
{ {
source_labels = [ "__journal__hostname" ]; template = {
target_label = "host"; source = "priority";
template = ''{{ if or (eq .Value "0") (eq .Value "1") (eq .Value "2") }}critical{{ else if eq .Value "3" }}error{{ else if eq .Value "4" }}warning{{ else if eq .Value "5" }}notice{{ else if eq .Value "6" }}info{{ else if eq .Value "7" }}debug{{ end }}'';
};
} }
# Attach as level label
{ labels.level = "priority"; }
]; ];
} }
{ {
@@ -53,8 +70,7 @@
labels = { labels = {
job = "varlog"; job = "varlog";
__path__ = "/var/log/**/*.log"; __path__ = "/var/log/**/*.log";
hostname = "${config.networking.hostName}"; } // hostLabels;
};
} }
]; ];
} }

View File

@@ -61,7 +61,7 @@ let
streams: [{ streams: [{
stream: { stream: {
job: $job, job: $job,
host: $host, hostname: $host,
type: $type, type: $type,
id: $id id: $id
}, },