Some checks failed
Run nix flake check / flake-check (push) Has been cancelled
- Switch vmalert from blackhole mode to sending alerts to local Alertmanager - Import alerttonotify service so alerts route to NATS notifications - Move alertmanager and grafana CNAMEs from http-proxy to monitoring02 - Add monitoring CNAME to monitoring02 - Add Caddy reverse proxy entries for alertmanager and grafana - Remove prometheus, alertmanager, and grafana Caddy entries from http-proxy (now served directly by monitoring02) - Move monitoring02 Vault AppRole to hosts-generated.tf with extra_policies support and prometheus-metrics policy - Update Promtail to use authenticated loki.home.2rjus.net endpoint only (remove unauthenticated monitoring01 client) - Update pipe-to-loki and bootstrap to use loki.home.2rjus.net with basic auth from Vault secret - Update migration plan with current status Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
191 lines
7.1 KiB
Nix
191 lines
7.1 KiB
Nix
{ pkgs, config, lib, ... }:
|
|
let
|
|
bootstrap-script = pkgs.writeShellApplication {
|
|
name = "nixos-bootstrap";
|
|
runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ];
|
|
text = ''
|
|
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Loki push API endpoint that receives the bootstrap status entries.
# Constant for the whole run, so it is marked readonly.
readonly LOKI_URL="https://loki.home.2rjus.net/loki/api/v1/push"
# Path of the file holding the Loki basic-auth password. The file is optional:
# when it does not exist, status entries are sent without credentials.
readonly LOKI_AUTH_FILE="/run/secrets/promtail-loki-auth"
|
|
|
|
# Send a single bootstrap status entry to Loki.
# Usage: log_to_loki <stage> <message>
# Globals read: LOKI_URL, LOKI_AUTH_FILE, HOSTNAME, BRANCH (falls back to
#   "master" when unset).
# Returns: always 0. Logging is best-effort: an unreachable Loki, a failure
#   to build the payload, or an unreadable auth file must never abort the
#   bootstrap, which runs under "set -e".
log_to_loki() {
  local stage="$1"
  local message="$2"

  # The entry timestamp is sent as a string of nanoseconds since the epoch;
  # whole seconds padded with nine zeros are used here.
  local timestamp_ns
  timestamp_ns="$(date +%s)000000000" || return 0

  # jq does the JSON string escaping of hostname, stage, branch and message.
  local payload
  payload=$(jq -n \
    --arg host "$HOSTNAME" \
    --arg stage "$stage" \
    --arg branch "''${BRANCH:-master}" \
    --arg ts "$timestamp_ns" \
    --arg msg "$message" \
    '{
      streams: [{
        stream: {
          job: "bootstrap",
          hostname: $host,
          stage: $stage,
          branch: $branch
        },
        values: [[$ts, $msg]]
      }]
    }') || return 0

  # Add basic auth only when the secret file exists and could be read.
  # NOTE(review): the password is passed on curl's command line and is
  # therefore visible in the process list while curl runs.
  local auth_args=()
  local loki_password
  if [[ -f "$LOKI_AUTH_FILE" ]] && loki_password=$(cat "$LOKI_AUTH_FILE" 2>/dev/null); then
    auth_args=(-u "promtail:$loki_password")
  fi

  # Short timeouts keep a slow or missing Loki from delaying the bootstrap;
  # output and exit status are discarded on purpose.
  curl -s --connect-timeout 2 --max-time 5 \
    -X POST \
    "''${auth_args[@]}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$LOKI_URL" >/dev/null 2>&1 || true
}
|
|
|
|
# Announce the bootstrap on the console.
banner_rule="================================================================================"
printf '%s\n' \
  "$banner_rule" \
  " NIXOS BOOTSTRAP IN PROGRESS" \
  "$banner_rule" \
  ""

# Cloud-init applies the hostname from user-data (the Terraform VM name), so
# the system hostname reported by hostnamectl identifies the host to build.
HOSTNAME=$(hostnamectl hostname)
# The flake branch comes from the environment; master when none is given.
BRANCH="''${NIXOS_FLAKE_BRANCH:-master}"

printf '%s\n' \
  "Hostname: $HOSTNAME" \
  "" \
  "Starting NixOS bootstrap for host: $HOSTNAME"

log_to_loki "starting" "Bootstrap starting for $HOSTNAME (branch: $BRANCH)"
|
|
|
|
echo "Waiting for network connectivity..."

# Verify we can reach the git server via HTTPS (doesn't respond to ping).
# The probe is retried a bounded number of times: a slow DHCP lease or DNS
# resolver right after boot should delay the bootstrap, not fail it for good.
network_attempts=6
network_retry_delay=5
network_ok=0
for ((attempt = 1; attempt <= network_attempts; attempt++)); do
  if curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; then
    network_ok=1
    break
  fi
  # No point in sleeping after the final attempt.
  if ((attempt < network_attempts)); then
    echo "git.t-juice.club not reachable yet (attempt $attempt/$network_attempts), retrying in ''${network_retry_delay}s..."
    sleep "$network_retry_delay"
  fi
done

if ((network_ok == 0)); then
  echo "ERROR: Cannot reach git.t-juice.club via HTTPS"
  echo "Check network configuration and DNS settings"
  log_to_loki "failed" "Network check failed - cannot reach git.t-juice.club"
  exit 1
fi

echo "Network connectivity confirmed"
log_to_loki "network_ok" "Network connectivity confirmed"
|
|
|
|
# Unwrap Vault token and store AppRole credentials (if provided).
# A missing, expired or already-used token is not fatal: a warning is printed
# and the bootstrap continues without Vault secrets.
if [[ -n "''${VAULT_WRAPPED_TOKEN:-}" ]]; then
  echo "Unwrapping Vault token to get AppRole credentials..."

  VAULT_ADDR="''${VAULT_ADDR:-https://vault01.home.2rjus.net:8200}"

  # Unwrap the token to get role_id and secret_id.
  # NOTE(review): -k turns off TLS certificate verification for the request
  # that carries the wrapped token - confirm this is intentional.
  UNWRAP_RESPONSE=$(curl -sk -X POST \
    -H "X-Vault-Token: $VAULT_WRAPPED_TOKEN" \
    "$VAULT_ADDR/v1/sys/wrapping/unwrap") || {
    echo "WARNING: Failed to unwrap Vault token (network error)"
    echo "Vault secrets will not be available, but continuing bootstrap..."
  }

  # Extract both credentials up front. The "// empty" alternative maps a
  # missing or null field to an empty string, so a response that has .data
  # but lacks either id counts as a failure instead of the literal text
  # "null" being stored. A non-JSON or empty response also yields empty ids.
  ROLE_ID=$(jq -r '.data.role_id // empty' <<<"$UNWRAP_RESPONSE" 2>/dev/null) || ROLE_ID=""
  SECRET_ID=$(jq -r '.data.secret_id // empty' <<<"$UNWRAP_RESPONSE" 2>/dev/null) || SECRET_ID=""

  # Check if unwrap was successful
  if [[ -n "$ROLE_ID" && -n "$SECRET_ID" ]]; then
    # Store credentials
    mkdir -p /var/lib/vault/approle
    (
      # Create the files owner-only from the start, so there is no window in
      # which a freshly written credential is readable by other users.
      umask 077
      printf '%s\n' "$ROLE_ID" > /var/lib/vault/approle/role-id
      printf '%s\n' "$SECRET_ID" > /var/lib/vault/approle/secret-id
    )
    # umask only affects newly created files; tighten pre-existing ones too.
    chmod 600 /var/lib/vault/approle/role-id
    chmod 600 /var/lib/vault/approle/secret-id

    echo "Vault credentials unwrapped and stored successfully"
    log_to_loki "vault_ok" "Vault credentials unwrapped and stored"
  else
    echo "WARNING: Failed to unwrap Vault token"
    if [[ -n "$UNWRAP_RESPONSE" ]]; then
      echo "Response: $UNWRAP_RESPONSE"
    fi
    echo "Possible causes:"
    echo " - Token already used (wrapped tokens are single-use)"
    echo " - Token expired (24h TTL)"
    echo " - Invalid token"
    echo ""
    echo "To regenerate token, run: create-host --hostname $HOSTNAME --force"
    echo ""
    echo "Vault secrets will not be available, but continuing bootstrap..."
    log_to_loki "vault_warn" "Failed to unwrap Vault token - continuing without secrets"
  fi
else
  echo "No Vault wrapped token provided (VAULT_WRAPPED_TOKEN not set)"
  echo "Skipping Vault credential setup"
  log_to_loki "vault_skip" "No Vault token provided - skipping credential setup"
fi
|
|
|
|
echo "Fetching and building NixOS configuration from flake..."
echo "Using git branch: $BRANCH"
log_to_loki "building" "Starting nixos-rebuild boot"

# Flake reference for this host: repository at the chosen branch, with the
# hostname selecting the configuration to build.
FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git?ref=''${BRANCH}#''${HOSTNAME}"

# Failure path first: report, log, and leave the machine up for inspection.
if ! nixos-rebuild boot --flake "$FLAKE_URL"; then
  echo "ERROR: nixos-rebuild failed for $HOSTNAME"
  echo "Check that flake has configuration for this hostname"
  echo "Manual intervention required - system will not reboot"
  log_to_loki "failed" "nixos-rebuild failed - manual intervention required"
  exit 1
fi

# The build succeeded; reboot into it after a short pause.
echo "Successfully built configuration for $HOSTNAME"
echo "Rebooting into new configuration..."
log_to_loki "success" "Build successful - rebooting into new configuration"
sleep 2
systemctl reboot
|
|
'';
|
|
};
|
|
in
|
|
{
|
|
# Login greeting that marks this machine as the bootstrap image and tells the
# operator where to follow progress. Built line by line, each line terminated
# by a newline. The backslash sequences in the title are kept verbatim
# (presumably getty escapes that are expanded at display time).
services.getty.greetingLine = lib.mkForce (lib.concatMapStrings (line: line + "\n") [
  "================================================================================"
  ''BOOTSTRAP IMAGE - NixOS \V (\l)''
  "================================================================================"
  ""
  "Bootstrap service is running. Logs are displayed on tty1."
  "Check status: journalctl -fu nixos-bootstrap"
]);
|
|
|
|
# One-shot unit that runs the bootstrap script during boot.
systemd.services.nixos-bootstrap =
  let
    # Units this service is ordered against; named once so the dependency
    # lists below cannot drift apart.
    cloudInitUnit = "cloud-config.service";
    networkOnlineUnit = "network-online.target";
  in
  {
    description = "Bootstrap NixOS configuration from flake on first boot";

    # Started as part of normal boot.
    wantedBy = [ "multi-user.target" ];

    # Cloud-init is a hard requirement (it sets the hostname); being online
    # is requested but not required. The service is ordered after both.
    requires = [ cloudInitUnit ];
    wants = [ networkOnlineUnit ];
    after = [ cloudInitUnit networkOnlineUnit ];

    serviceConfig = {
      # Runs once to completion and is then reported as active.
      Type = "oneshot";
      RemainAfterExit = true;

      # Environment written by cloud-init (write_files); the leading "-"
      # makes a missing file a non-error.
      EnvironmentFile = "-/run/cloud-init-env";

      ExecStart = lib.getExe bootstrap-script;

      # Show script output both in the journal and on the console.
      StandardOutput = "journal+console";
      StandardError = "journal+console";
    };
  };
|
|
}
|