loki-monitoring02 #41

Merged
torjus merged 7 commits from loki-monitoring02 into master 2026-02-17 19:40:33 +00:00
11 changed files with 160 additions and 24 deletions

3
.gitignore vendored
View File

@@ -2,6 +2,9 @@
result result
result-* result-*
# MCP config (contains secrets)
.mcp.json
# Terraform/OpenTofu # Terraform/OpenTofu
terraform/.terraform/ terraform/.terraform/
terraform/.terraform.lock.hcl terraform/.terraform.lock.hcl

View File

@@ -20,7 +20,9 @@
"env": { "env": {
"PROMETHEUS_URL": "https://prometheus.home.2rjus.net", "PROMETHEUS_URL": "https://prometheus.home.2rjus.net",
"ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net", "ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net",
"LOKI_URL": "http://monitoring01.home.2rjus.net:3100" "LOKI_URL": "https://loki.home.2rjus.net",
"LOKI_USERNAME": "promtail",
"LOKI_PASSWORD": "<password from: bao kv get -field=password secret/shared/loki/push-auth>"
} }
}, },
"homelab-deploy": { "homelab-deploy": {
@@ -44,4 +46,3 @@
} }
} }
} }

View File

@@ -14,8 +14,8 @@ a `monitoring` CNAME for seamless transition.
- Alertmanager (routes to alerttonotify webhook) - Alertmanager (routes to alerttonotify webhook)
- Grafana (dashboards, datasources) - Grafana (dashboards, datasources)
- Loki (log aggregation from all hosts via Promtail) - Loki (log aggregation from all hosts via Promtail)
- Tempo (distributed tracing) - Tempo (distributed tracing) - not actively used
- Pyroscope (continuous profiling) - Pyroscope (continuous profiling) - not actively used
**Hardcoded References to monitoring01:** **Hardcoded References to monitoring01:**
- `system/monitoring/logs.nix` - Promtail sends logs to `http://monitoring01.home.2rjus.net:3100` - `system/monitoring/logs.nix` - Promtail sends logs to `http://monitoring01.home.2rjus.net:3100`
@@ -44,9 +44,7 @@ If multi-year retention with downsampling becomes necessary later, Thanos can be
│ VictoriaMetrics│ │ VictoriaMetrics│
│ + Grafana │ │ + Grafana │
monitoring │ + Loki │ monitoring │ + Loki │
CNAME ──────────│ + Tempo CNAME ──────────│ + Alertmanager
│ + Pyroscope │
│ + Alertmanager │
│ (vmalert) │ │ (vmalert) │
└─────────────────┘ └─────────────────┘
@@ -94,16 +92,11 @@ Imported by monitoring02 alongside the existing Grafana service.
4. **Grafana** (port 3000): [DONE] 4. **Grafana** (port 3000): [DONE]
- VictoriaMetrics datasource (localhost:8428) as default - VictoriaMetrics datasource (localhost:8428) as default
- monitoring01 Prometheus datasource kept for comparison during parallel operation - monitoring01 Prometheus datasource kept for comparison during parallel operation
- Loki datasource pointing to monitoring01 (until Loki migrated) - Loki datasource pointing to localhost (after Loki migrated to monitoring02)
5. **Loki** (port 3100): 5. **Loki** (port 3100): [DONE]
- TODO: Same configuration as current - Same configuration as monitoring01 in standalone `services/loki/` module
- Grafana datasource updated to localhost:3100
6. **Tempo** (ports 3200, 3201):
- TODO: Same configuration
7. **Pyroscope** (port 4040):
- TODO: Same Docker-based deployment
**Note:** pve-exporter and pushgateway scrape targets are not included on monitoring02. **Note:** pve-exporter and pushgateway scrape targets are not included on monitoring02.
pve-exporter requires a local exporter instance; pushgateway is replaced by VictoriaMetrics pve-exporter requires a local exporter instance; pushgateway is replaced by VictoriaMetrics
@@ -147,7 +140,6 @@ Update hardcoded references to use the CNAME:
- prometheus.home.2rjus.net -> monitoring.home.2rjus.net:8428 - prometheus.home.2rjus.net -> monitoring.home.2rjus.net:8428
- alertmanager.home.2rjus.net -> monitoring.home.2rjus.net:9093 - alertmanager.home.2rjus.net -> monitoring.home.2rjus.net:9093
- grafana.home.2rjus.net -> monitoring.home.2rjus.net:3000 - grafana.home.2rjus.net -> monitoring.home.2rjus.net:3000
- pyroscope.home.2rjus.net -> monitoring.home.2rjus.net:4040
Note: `hosts/template2/bootstrap.nix` stays pointed at monitoring01 until decommission. Note: `hosts/template2/bootstrap.nix` stays pointed at monitoring01 until decommission.
@@ -172,8 +164,8 @@ Once ready to cut over:
## Current Progress ## Current Progress
- **Phase 1** complete (2026-02-08): monitoring02 host created, Grafana with Kanidm OIDC validated - **Phase 1** complete (2026-02-08): monitoring02 host created, Grafana with Kanidm OIDC validated
- **Phase 2** in progress (2026-02-17): VictoriaMetrics, vmalert, Alertmanager, Grafana datasources configured - **Phase 2** complete (2026-02-17): VictoriaMetrics, vmalert, Alertmanager, Loki, Grafana datasources configured
- Remaining: Loki, Tempo, Pyroscope migration - Tempo and Pyroscope deferred (not actively used; can be added later if needed)
## Open Questions ## Open Questions

View File

@@ -18,7 +18,7 @@
role = "monitoring"; role = "monitoring";
}; };
homelab.dns.cnames = [ "grafana-test" "metrics" "vmalert" ]; homelab.dns.cnames = [ "grafana-test" "metrics" "vmalert" "loki" ];
# Enable Vault integration # Enable Vault integration
vault.enable = true; vault.enable = true;

View File

@@ -3,5 +3,6 @@
./configuration.nix ./configuration.nix
../../services/grafana ../../services/grafana
../../services/victoriametrics ../../services/victoriametrics
../../services/loki
]; ];
} }

View File

@@ -54,7 +54,7 @@
{ {
name = "Loki"; name = "Loki";
type = "loki"; type = "loki";
url = "http://monitoring01.home.2rjus.net:3100"; url = "http://localhost:3100";
uid = "loki"; uid = "loki";
} }
]; ];

104
services/loki/default.nix Normal file
View File

@@ -0,0 +1,104 @@
# Loki log store, exposed through a Caddy reverse proxy with HTTP basic auth.
#
# Loki listens on 127.0.0.1:3100 with its own auth disabled. The Caddy virtual
# host "loki.home.2rjus.net" proxies to it and requires the "promtail" user,
# whose password hash is derived at boot from the Vault secret
# shared/loki/push-auth.
{ config, lib, pkgs, ... }:
let
  # Script to generate bcrypt hash from Vault password for Caddy basic_auth.
  # Reads the plaintext from /run/secrets/loki-push-auth and writes
  # LOKI_PUSH_HASH=<hash> to /run/secrets/caddy-loki-auth.env.
  generateCaddyAuth = pkgs.writeShellApplication {
    name = "generate-caddy-loki-auth";
    runtimeInputs = [ config.services.caddy.package ];
    text = ''
      # Create the env file owner-only from the first byte written, instead of
      # relying on the chmod below to tighten it after the fact.
      umask 077
      PASSWORD=$(cat /run/secrets/loki-push-auth)
      # NOTE(review): --plaintext puts the password on caddy's command line for
      # the duration of the hash; consider feeding it via stdin if the packaged
      # caddy supports that - confirm before changing.
      HASH=$(caddy hash-password --plaintext "$PASSWORD")
      echo "LOKI_PUSH_HASH=$HASH" > /run/secrets/caddy-loki-auth.env
      # Still needed when the file already exists from an earlier run, since
      # umask only affects newly created files.
      chmod 0400 /run/secrets/caddy-loki-auth.env
    '';
  };
in
{
  # Fetch Loki push password from Vault
  # (presumably materialised at /run/secrets/loki-push-auth, which is the path
  # the script above reads - verify against the vault.secrets module).
  vault.secrets.loki-push-auth = {
    secretPath = "shared/loki/push-auth";
    extractKey = "password";
    services = [ "caddy" ];
  };

  # Generate bcrypt hash for Caddy before it starts.
  # Ordered after the Vault fetch unit and before caddy.service; requiredBy
  # makes Caddy depend on this unit having run.
  systemd.services.caddy-loki-auth = {
    description = "Generate Caddy basic auth hash for Loki";
    after = [ "vault-secret-loki-push-auth.service" ];
    requires = [ "vault-secret-loki-push-auth.service" ];
    before = [ "caddy.service" ];
    requiredBy = [ "caddy.service" ];
    serviceConfig = {
      Type = "oneshot";
      RemainAfterExit = true;
      ExecStart = lib.getExe generateCaddyAuth;
    };
  };

  # Load the bcrypt hash as environment variable for Caddy
  services.caddy.environmentFile = "/run/secrets/caddy-loki-auth.env";

  # Caddy reverse proxy for Loki with basic auth.
  # The hash is taken from the LOKI_PUSH_HASH variable loaded above.
  services.caddy.virtualHosts."loki.home.2rjus.net".extraConfig = ''
    basic_auth {
      promtail {env.LOKI_PUSH_HASH}
    }
    reverse_proxy http://127.0.0.1:3100
  '';

  services.loki = {
    enable = true;
    configuration = {
      # Loki's own auth is off; access control is done by Caddy in front.
      auth_enabled = false;

      # Loopback only, so the Caddy virtual host is the remote entry point.
      server = {
        http_listen_address = "127.0.0.1";
        http_listen_port = 3100;
      };

      # Single instance: in-memory ring, replication factor 1.
      common = {
        ring = {
          instance_addr = "127.0.0.1";
          kvstore = {
            store = "inmemory";
          };
        };
        replication_factor = 1;
        path_prefix = "/var/lib/loki";
      };

      # TSDB index with filesystem object store, schema v13, daily index period.
      schema_config = {
        configs = [
          {
            from = "2024-01-01";
            store = "tsdb";
            object_store = "filesystem";
            schema = "v13";
            index = {
              prefix = "loki_index_";
              period = "24h";
            };
          }
        ];
      };

      storage_config = {
        filesystem = {
          directory = "/var/lib/loki/chunks";
        };
      };

      # Compactor with retention enabled; works together with
      # limits_config.retention_period below.
      compactor = {
        working_directory = "/var/lib/loki/compactor";
        compaction_interval = "10m";
        retention_enabled = true;
        retention_delete_delay = "2h";
        retention_delete_worker_count = 150;
        delete_request_store = "filesystem";
      };

      # Retention window plus ingestion and query limits.
      limits_config = {
        retention_period = "30d";
        ingestion_rate_mb = 10;
        ingestion_burst_size_mb = 20;
        max_streams_per_user = 10000;
        max_query_series = 500;
        max_query_parallelism = 8;
      };
    };
  };
}

View File

@@ -16,6 +16,16 @@ in
SystemKeepFree=1G SystemKeepFree=1G
''; '';
}; };
# Promtail's credential for pushing to Loki, read from the shared Vault path.
# Declared only when config.vault.enable is true; the secret is owned by the
# promtail user and group and is tied to the promtail service.
vault.secrets.promtail-loki-auth = lib.mkIf config.vault.enable (
  let
    # The same account name is used for file ownership and the consuming service.
    promtailAccount = "promtail";
  in
  {
    secretPath = "shared/loki/push-auth";
    extractKey = "password";
    owner = promtailAccount;
    group = promtailAccount;
    services = [ promtailAccount ];
  }
);
# Configure promtail # Configure promtail
services.promtail = { services.promtail = {
enable = true; enable = true;
@@ -31,6 +41,14 @@ in
{ {
url = "http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"; url = "http://monitoring01.home.2rjus.net:3100/loki/api/v1/push";
} }
] ++ lib.optionals config.vault.enable [
{
url = "https://loki.home.2rjus.net/loki/api/v1/push";
basic_auth = {
username = "promtail";
password_file = "/run/secrets/promtail-loki-auth";
};
}
]; ];
scrape_configs = [ scrape_configs = [

View File

@@ -26,6 +26,17 @@ path "secret/data/shared/nixos-exporter/*" {
EOT EOT
} }
# Shared policy for Loki push authentication (all hosts push logs).
# Read-only: hosts only fetch the secret value under shared/loki/. The "list"
# capability is omitted because on a KV v2 mount listing is authorised on the
# secret/metadata/ path, so granting it on secret/data/ has no effect.
resource "vault_policy" "loki_push" {
  name   = "loki-push"
  policy = <<EOT
path "secret/data/shared/loki/*" {
  capabilities = ["read"]
}
EOT
}
# Define host access policies # Define host access policies
locals { locals {
host_policies = { host_policies = {
@@ -78,7 +89,7 @@ locals {
] ]
} }
# Wave 3: DNS servers # Wave 3: DNS servers (managed in hosts-generated.tf)
# Wave 4: http-proxy # Wave 4: http-proxy
"http-proxy" = { "http-proxy" = {
@@ -138,7 +149,7 @@ resource "vault_approle_auth_backend_role" "hosts" {
backend = vault_auth_backend.approle.path backend = vault_auth_backend.approle.path
role_name = each.key role_name = each.key
token_policies = concat( token_policies = concat(
["${each.key}-policy", "homelab-deploy", "nixos-exporter"], ["${each.key}-policy", "homelab-deploy", "nixos-exporter", "loki-push"],
lookup(each.value, "extra_policies", []) lookup(each.value, "extra_policies", [])
) )

View File

@@ -74,7 +74,7 @@ resource "vault_approle_auth_backend_role" "generated_hosts" {
backend = vault_auth_backend.approle.path backend = vault_auth_backend.approle.path
role_name = each.key role_name = each.key
token_policies = ["host-${each.key}", "homelab-deploy", "nixos-exporter"] token_policies = ["host-${each.key}", "homelab-deploy", "nixos-exporter", "loki-push"]
secret_id_ttl = 0 # Never expire (wrapped tokens provide time limit) secret_id_ttl = 0 # Never expire (wrapped tokens provide time limit)
token_ttl = 3600 token_ttl = 3600
token_max_ttl = 3600 token_max_ttl = 3600

View File

@@ -153,6 +153,12 @@ locals {
auto_generate = true auto_generate = true
password_length = 64 password_length = 64
} }
# Loki push authentication (used by Promtail on all hosts).
# Auto-generated 32-character password stored at shared/loki/push-auth; the
# "password" field is what the Loki/Caddy and Promtail configurations extract.
"shared/loki/push-auth" = {
auto_generate = true
password_length = 32
}
} }
} }