diff --git a/.gitignore b/.gitignore index 567ee61..18fbe70 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ result result-* +# MCP config (contains secrets) +.mcp.json + # Terraform/OpenTofu terraform/.terraform/ terraform/.terraform.lock.hcl diff --git a/.mcp.json b/.mcp.json.example similarity index 88% rename from .mcp.json rename to .mcp.json.example index f5d61f6..ff4fb40 100644 --- a/.mcp.json +++ b/.mcp.json.example @@ -20,7 +20,9 @@ "env": { "PROMETHEUS_URL": "https://prometheus.home.2rjus.net", "ALERTMANAGER_URL": "https://alertmanager.home.2rjus.net", - "LOKI_URL": "http://monitoring01.home.2rjus.net:3100" + "LOKI_URL": "https://loki.home.2rjus.net", + "LOKI_USERNAME": "promtail", + "LOKI_PASSWORD": "" } }, "homelab-deploy": { @@ -44,4 +46,3 @@ } } } - diff --git a/docs/plans/monitoring-migration-victoriametrics.md b/docs/plans/monitoring-migration-victoriametrics.md index 7c6c349..d562c41 100644 --- a/docs/plans/monitoring-migration-victoriametrics.md +++ b/docs/plans/monitoring-migration-victoriametrics.md @@ -14,8 +14,8 @@ a `monitoring` CNAME for seamless transition. - Alertmanager (routes to alerttonotify webhook) - Grafana (dashboards, datasources) - Loki (log aggregation from all hosts via Promtail) -- Tempo (distributed tracing) -- Pyroscope (continuous profiling) +- Tempo (distributed tracing) - not actively used +- Pyroscope (continuous profiling) - not actively used **Hardcoded References to monitoring01:** - `system/monitoring/logs.nix` - Promtail sends logs to `http://monitoring01.home.2rjus.net:3100` @@ -44,9 +44,7 @@ If multi-year retention with downsampling becomes necessary later, Thanos can be │ VictoriaMetrics│ │ + Grafana │ monitoring │ + Loki │ - CNAME ──────────│ + Tempo │ - │ + Pyroscope │ - │ + Alertmanager │ + CNAME ──────────│ + Alertmanager │ │ (vmalert) │ └─────────────────┘ ▲ @@ -94,16 +92,11 @@ Imported by monitoring02 alongside the existing Grafana service. 4. **Grafana** (port 3000): [DONE] - VictoriaMetrics datasource (localhost:8428) as default - monitoring01 Prometheus datasource kept for comparison during parallel operation - - Loki datasource pointing to monitoring01 (until Loki migrated) + - Loki datasource pointing to localhost (after Loki migrated to monitoring02) -5. **Loki** (port 3100): - - TODO: Same configuration as current - -6. **Tempo** (ports 3200, 3201): - - TODO: Same configuration - -7. **Pyroscope** (port 4040): - - TODO: Same Docker-based deployment +5. **Loki** (port 3100): [DONE] + - Same configuration as monitoring01 in standalone `services/loki/` module + - Grafana datasource updated to localhost:3100 **Note:** pve-exporter and pushgateway scrape targets are not included on monitoring02. pve-exporter requires a local exporter instance; pushgateway is replaced by VictoriaMetrics @@ -147,7 +140,6 @@ Update hardcoded references to use the CNAME: - prometheus.home.2rjus.net -> monitoring.home.2rjus.net:8428 - alertmanager.home.2rjus.net -> monitoring.home.2rjus.net:9093 - grafana.home.2rjus.net -> monitoring.home.2rjus.net:3000 - - pyroscope.home.2rjus.net -> monitoring.home.2rjus.net:4040 Note: `hosts/template2/bootstrap.nix` stays pointed at monitoring01 until decommission. @@ -172,8 +164,8 @@ Once ready to cut over: ## Current Progress - **Phase 1** complete (2026-02-08): monitoring02 host created, Grafana with Kanidm OIDC validated -- **Phase 2** in progress (2026-02-17): VictoriaMetrics, vmalert, Alertmanager, Grafana datasources configured - - Remaining: Loki, Tempo, Pyroscope migration +- **Phase 2** complete (2026-02-17): VictoriaMetrics, vmalert, Alertmanager, Loki, Grafana datasources configured + - Tempo and Pyroscope deferred (not actively used; can be added later if needed) ## Open Questions diff --git a/hosts/monitoring02/configuration.nix b/hosts/monitoring02/configuration.nix index 3cf2f8d..2616555 100644 --- a/hosts/monitoring02/configuration.nix +++ b/hosts/monitoring02/configuration.nix @@ -18,7 +18,7 @@ role = "monitoring"; }; - homelab.dns.cnames = [ "grafana-test" "metrics" "vmalert" ]; + homelab.dns.cnames = [ "grafana-test" "metrics" "vmalert" "loki" ]; # Enable Vault integration vault.enable = true; diff --git a/hosts/monitoring02/default.nix b/hosts/monitoring02/default.nix index ea273a4..a8ef155 100644 --- a/hosts/monitoring02/default.nix +++ b/hosts/monitoring02/default.nix @@ -3,5 +3,6 @@ ./configuration.nix ../../services/grafana ../../services/victoriametrics + ../../services/loki ]; } \ No newline at end of file diff --git a/services/grafana/default.nix b/services/grafana/default.nix index a0dc7b4..ed5aece 100644 --- a/services/grafana/default.nix +++ b/services/grafana/default.nix @@ -54,7 +54,7 @@ { name = "Loki"; type = "loki"; - url = "http://monitoring01.home.2rjus.net:3100"; + url = "http://localhost:3100"; uid = "loki"; } ]; diff --git a/services/loki/default.nix b/services/loki/default.nix new file mode 100644 index 0000000..f50a6e0 --- /dev/null +++ b/services/loki/default.nix @@ -0,0 +1,104 @@ +{ config, lib, pkgs, ... }: +let + # Script to generate bcrypt hash from Vault password for Caddy basic_auth + generateCaddyAuth = pkgs.writeShellApplication { + name = "generate-caddy-loki-auth"; + runtimeInputs = [ config.services.caddy.package ]; + text = '' + PASSWORD=$(cat /run/secrets/loki-push-auth) + HASH=$(caddy hash-password --plaintext "$PASSWORD") + echo "LOKI_PUSH_HASH=$HASH" > /run/secrets/caddy-loki-auth.env + chmod 0400 /run/secrets/caddy-loki-auth.env + ''; + }; +in +{ + # Fetch Loki push password from Vault + vault.secrets.loki-push-auth = { + secretPath = "shared/loki/push-auth"; + extractKey = "password"; + services = [ "caddy" ]; + }; + + # Generate bcrypt hash for Caddy before it starts + systemd.services.caddy-loki-auth = { + description = "Generate Caddy basic auth hash for Loki"; + after = [ "vault-secret-loki-push-auth.service" ]; + requires = [ "vault-secret-loki-push-auth.service" ]; + before = [ "caddy.service" ]; + requiredBy = [ "caddy.service" ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + ExecStart = lib.getExe generateCaddyAuth; + }; + }; + + # Load the bcrypt hash as environment variable for Caddy + services.caddy.environmentFile = "/run/secrets/caddy-loki-auth.env"; + + # Caddy reverse proxy for Loki with basic auth + services.caddy.virtualHosts."loki.home.2rjus.net".extraConfig = '' + basic_auth { + promtail {env.LOKI_PUSH_HASH} + } + reverse_proxy http://127.0.0.1:3100 + ''; + + services.loki = { + enable = true; + configuration = { + auth_enabled = false; + + server = { + http_listen_address = "127.0.0.1"; + http_listen_port = 3100; + }; + common = { + ring = { + instance_addr = "127.0.0.1"; + kvstore = { + store = "inmemory"; + }; + }; + replication_factor = 1; + path_prefix = "/var/lib/loki"; + }; + schema_config = { + configs = [ + { + from = "2024-01-01"; + store = "tsdb"; + object_store = "filesystem"; + schema = "v13"; + index = { + prefix = "loki_index_"; + period = "24h"; + }; + } + ]; + }; + storage_config = { + filesystem = { + directory = "/var/lib/loki/chunks"; + }; + }; + compactor = { + working_directory = "/var/lib/loki/compactor"; + compaction_interval = "10m"; + retention_enabled = true; + retention_delete_delay = "2h"; + retention_delete_worker_count = 150; + delete_request_store = "filesystem"; + }; + limits_config = { + retention_period = "30d"; + ingestion_rate_mb = 10; + ingestion_burst_size_mb = 20; + max_streams_per_user = 10000; + max_query_series = 500; + max_query_parallelism = 8; + }; + }; + }; +} diff --git a/system/monitoring/logs.nix b/system/monitoring/logs.nix index 68d9cac..6a21a62 100644 --- a/system/monitoring/logs.nix +++ b/system/monitoring/logs.nix @@ -16,6 +16,16 @@ in SystemKeepFree=1G ''; }; + + # Fetch Loki push password from Vault (only on hosts with Vault enabled) + vault.secrets.promtail-loki-auth = lib.mkIf config.vault.enable { + secretPath = "shared/loki/push-auth"; + extractKey = "password"; + owner = "promtail"; + group = "promtail"; + services = [ "promtail" ]; + }; + # Configure promtail services.promtail = { enable = true; @@ -31,6 +41,14 @@ in { url = "http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"; } + ] ++ lib.optionals config.vault.enable [ + { + url = "https://loki.home.2rjus.net/loki/api/v1/push"; + basic_auth = { + username = "promtail"; + password_file = "/run/secrets/promtail-loki-auth"; + }; + } ]; scrape_configs = [ diff --git a/terraform/vault/approle.tf b/terraform/vault/approle.tf index 8542812..5f76056 100644 --- a/terraform/vault/approle.tf +++ b/terraform/vault/approle.tf @@ -26,6 +26,17 @@ path "secret/data/shared/nixos-exporter/*" { EOT } +# Shared policy for Loki push authentication (all hosts push logs) +resource "vault_policy" "loki_push" { + name = "loki-push" + + policy = <