Files
nixos-servers/hosts/template2/bootstrap.nix
Torjus Håkestad ae3039af19 template2: send bootstrap status to Loki for remote monitoring
Adds log_to_loki function that pushes structured log entries to Loki
at key bootstrap stages (starting, network_ok, vault_*, building,
success, failed). Enables querying bootstrap state via LogQL without
console access.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-07 15:34:47 +01:00

184 lines
6.9 KiB
Nix

{ pkgs, config, lib, ... }:
let
bootstrap-script = pkgs.writeShellApplication {
name = "nixos-bootstrap";
runtimeInputs = with pkgs; [ systemd curl nixos-rebuild jq git ];
text = ''
set -euo pipefail
LOKI_URL="http://monitoring01.home.2rjus.net:3100/loki/api/v1/push"
# Send a log entry to Loki with bootstrap status
# Usage: log_to_loki <stage> <message>
# Fails silently if Loki is unreachable
log_to_loki() {
local stage="$1"
local message="$2"
local timestamp_ns
timestamp_ns="$(date +%s)000000000"
local payload
payload=$(jq -n \
--arg host "$HOSTNAME" \
--arg stage "$stage" \
--arg branch "''${BRANCH:-master}" \
--arg ts "$timestamp_ns" \
--arg msg "$message" \
'{
streams: [{
stream: {
job: "bootstrap",
host: $host,
stage: $stage,
branch: $branch
},
values: [[$ts, $msg]]
}]
}')
curl -s --connect-timeout 2 --max-time 5 \
-X POST \
-H "Content-Type: application/json" \
-d "$payload" \
"$LOKI_URL" >/dev/null 2>&1 || true
}
echo "================================================================================"
echo " NIXOS BOOTSTRAP IN PROGRESS"
echo "================================================================================"
echo ""
# Read hostname set by cloud-init (from Terraform VM name via user-data)
# Cloud-init sets the system hostname from user-data.txt, so we read it from hostnamectl
HOSTNAME=$(hostnamectl hostname)
# Read git branch from environment, default to master
BRANCH="''${NIXOS_FLAKE_BRANCH:-master}"
echo "Hostname: $HOSTNAME"
echo ""
echo "Starting NixOS bootstrap for host: $HOSTNAME"
log_to_loki "starting" "Bootstrap starting for $HOSTNAME (branch: $BRANCH)"
echo "Waiting for network connectivity..."
# Verify we can reach the git server via HTTPS (doesn't respond to ping)
if ! curl -s --connect-timeout 5 --max-time 10 https://git.t-juice.club >/dev/null 2>&1; then
echo "ERROR: Cannot reach git.t-juice.club via HTTPS"
echo "Check network configuration and DNS settings"
log_to_loki "failed" "Network check failed - cannot reach git.t-juice.club"
exit 1
fi
echo "Network connectivity confirmed"
log_to_loki "network_ok" "Network connectivity confirmed"
# Unwrap Vault token and store AppRole credentials (if provided)
if [ -n "''${VAULT_WRAPPED_TOKEN:-}" ]; then
echo "Unwrapping Vault token to get AppRole credentials..."
VAULT_ADDR="''${VAULT_ADDR:-https://vault01.home.2rjus.net:8200}"
# Unwrap the token to get role_id and secret_id
UNWRAP_RESPONSE=$(curl -sk -X POST \
-H "X-Vault-Token: $VAULT_WRAPPED_TOKEN" \
"$VAULT_ADDR/v1/sys/wrapping/unwrap") || {
echo "WARNING: Failed to unwrap Vault token (network error)"
echo "Vault secrets will not be available, but continuing bootstrap..."
}
# Check if unwrap was successful
if [ -n "$UNWRAP_RESPONSE" ] && echo "$UNWRAP_RESPONSE" | jq -e '.data' >/dev/null 2>&1; then
ROLE_ID=$(echo "$UNWRAP_RESPONSE" | jq -r '.data.role_id')
SECRET_ID=$(echo "$UNWRAP_RESPONSE" | jq -r '.data.secret_id')
# Store credentials
mkdir -p /var/lib/vault/approle
echo "$ROLE_ID" > /var/lib/vault/approle/role-id
echo "$SECRET_ID" > /var/lib/vault/approle/secret-id
chmod 600 /var/lib/vault/approle/role-id
chmod 600 /var/lib/vault/approle/secret-id
echo "Vault credentials unwrapped and stored successfully"
log_to_loki "vault_ok" "Vault credentials unwrapped and stored"
else
echo "WARNING: Failed to unwrap Vault token"
if [ -n "$UNWRAP_RESPONSE" ]; then
echo "Response: $UNWRAP_RESPONSE"
fi
echo "Possible causes:"
echo " - Token already used (wrapped tokens are single-use)"
echo " - Token expired (24h TTL)"
echo " - Invalid token"
echo ""
echo "To regenerate token, run: create-host --hostname $HOSTNAME --force"
echo ""
echo "Vault secrets will not be available, but continuing bootstrap..."
log_to_loki "vault_warn" "Failed to unwrap Vault token - continuing without secrets"
fi
else
echo "No Vault wrapped token provided (VAULT_WRAPPED_TOKEN not set)"
echo "Skipping Vault credential setup"
log_to_loki "vault_skip" "No Vault token provided - skipping credential setup"
fi
echo "Fetching and building NixOS configuration from flake..."
echo "Using git branch: $BRANCH"
log_to_loki "building" "Starting nixos-rebuild boot"
# Build and activate the host-specific configuration
FLAKE_URL="git+https://git.t-juice.club/torjus/nixos-servers.git?ref=$BRANCH#''${HOSTNAME}"
if nixos-rebuild boot --flake "$FLAKE_URL"; then
echo "Successfully built configuration for $HOSTNAME"
echo "Rebooting into new configuration..."
log_to_loki "success" "Build successful - rebooting into new configuration"
sleep 2
systemctl reboot
else
echo "ERROR: nixos-rebuild failed for $HOSTNAME"
echo "Check that flake has configuration for this hostname"
echo "Manual intervention required - system will not reboot"
log_to_loki "failed" "nixos-rebuild failed - manual intervention required"
exit 1
fi
'';
};
in
{
# Custom greeting line to indicate this is a bootstrap image
services.getty.greetingLine = lib.mkForce ''
================================================================================
BOOTSTRAP IMAGE - NixOS \V (\l)
================================================================================
Bootstrap service is running. Logs are displayed on tty1.
Check status: journalctl -fu nixos-bootstrap
'';
systemd.services."nixos-bootstrap" = {
description = "Bootstrap NixOS configuration from flake on first boot";
# Wait for cloud-init to finish setting hostname and network to be online
after = [ "cloud-config.service" "network-online.target" ];
wants = [ "network-online.target" ];
requires = [ "cloud-config.service" ];
# Run on boot
wantedBy = [ "multi-user.target" ];
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
ExecStart = lib.getExe bootstrap-script;
# Read environment variables from cloud-init (set by cloud-init write_files)
EnvironmentFile = "-/run/cloud-init-env";
# Log to journal and console
StandardOutput = "journal+console";
StandardError = "journal+console";
};
};
}